def reply_to_tweets():
    print('retrieving and replying to tweets...', flush=True)
    # last seen ids used for testing: 1112619279025725441, 1124521036538568704
    url = "http://188.166.216.148:4545/"
    response = requests.request("GET", url)
    last_seen_id = response.text
    mentions = api.mentions_timeline(last_seen_id, tweet_mode='extended')
    print('last seen id : ', last_seen_id)
    # print(mentions)
    # Iterate in reverse: the mentions timeline is returned newest first,
    # so reversing processes the mentions in chronological order.
    for mention in reversed(mentions):
        print(str(mention.id) + ' - ' + mention.full_text + ' - ' +
              str(mention.in_reply_to_screen_name), flush=True)
        last_seen_id = str(mention.id)
        store_last_seen_id(last_seen_id)
        tweet = mention.full_text
        tlow = tweet.lower()
        # print(tlow)
        city = re.findall(r"#(\w+)", tlow)
        translate = Translate()
        ans = translate.kamusDaerah(city)
        print(ans)
        # if '#hai' in mention.full_text.lower():
        if ans != "no" and mention.in_reply_to_screen_name != 'cuaca_kita':
            print('found a tweet that needs a reply!', flush=True)
def Vacancy_info(url):
    print(url)
    page = requests.get(url)
    # /html/body/table[2]/tbody/tr/td[2]/div/table/tbody/tr[2]/td[2]/table/tbody/tr/td/div/div[7]

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/table[2]/tr/td[2]/div/table/tr[2]/td[2]/table/tr/td/div[6]'
        ).get()
        description = remove_tags(description)
        description = description.strip()
        # description = re.sub(r"\s+", " ", description)
    except:
        description = ""
    if description is None:
        description = ""

    try:
        lang = detect(description)
    except Exception:  # detect() raises on empty input
        lang = ""
    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":  # these scripts treat an "et" result as Georgian text
        description_ru = ""
        try:
            description_en = Translate(description)
        except:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)
        email = email[0]
    except:
        email = ""

    data = {
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en,
        "email": email
    }
    print("Info Scraped Successfully")
    return data

# //*[@id="CenterBody1"]/div/table/tbody/tr[2]/td[2]/table/tbody/tr/td/div/div[7]
# Vacancy_info('https://jobs24.ge/?act=obj&id=173982&PHPSESSID=tf04s8ucsd5trehbc1qouk90f25tnqma')
def Vacancy_info(url):
    url = url.replace("/en/", "/ge/")
    print(url)
    page = requests.get(url)

    # Description
    try:
        description = Selector(response=page).xpath(
            '//*[@id="job"]/table/tr[1]/td/table[2]').get()
        description = remove_tags(description)
        description = description.strip()
        description = description.replace('*', "")
        description = re.sub(r"\s+", " ", description)
        print(description)
    except:
        description = ""

    try:
        lang = detect(description)
    except Exception:  # detect() raises on empty input
        lang = ""
    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)
        email = email[0]
    except:
        email = ""

    data = {
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en,
        "email": email
    }
    return data

# Vacancy_info("https://jobs.ge/en/?view=jobs&id=268715")
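# The detect()/Translate() branching above reappears in nearly every scraper in
# this collection. A minimal sketch of a shared helper (hypothetical name
# split_by_language; assumes langdetect's detect() and the same Translate()
# wrapper used above) that the per-site functions could call instead:
def split_by_language(description):
    """Return (description_en, description_ru, description_local)."""
    description_en = description_ru = description_local = ""
    try:
        lang = detect(description)
    except Exception:
        lang = ""
    if lang == "ru":
        description_ru = description
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
    elif lang == "et":  # these scripts treat an "et" result as local-language text
        description_local = description
        try:
            description_en = Translate(description)
        except Exception:
            description_en = ""
    else:
        description_en = description
    return description_en, description_ru, description_local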
def reply_to_tweets():
    print('retrieving and replying to tweets...', flush=True)
    # last seen id used for testing: 1112619279025725441
    url = "http://188.166.216.148:4545/"
    response = requests.request("GET", url)
    last_seen_id = response.text
    mentions = api.mentions_timeline(last_seen_id, tweet_mode='extended')
    print('last seen id : ', last_seen_id)
    # Iterate in reverse: the mentions timeline is returned newest first,
    # so reversing processes the mentions in chronological order.
    for mention in reversed(mentions):
        print(str(mention.id) + ' - ' + mention.full_text, flush=True)
        last_seen_id = str(mention.id)
        store_last_seen_id(last_seen_id)
        tweet = mention.full_text
        tlow = tweet.lower()
        city = re.findall(r"#(\w+)", tlow)
        translate = Translate()
        ans = translate.kamusDaerah(city)
        print(ans)
        # if '#hai' in mention.full_text.lower():
        if ans != "no":
            print('found a tweet that needs a reply!', flush=True)
            text_gen = Text_generator()
            data = text_gen.getCData(ans)
            sentence = text_gen.generator(data)
            print('responding to tweet...', flush=True)
            api.update_status(
                'Hai! @' + mention.user.screen_name + " " + sentence,
                mention.id)
        elif city != []:
            # Reply (in Indonesian): "Unfortunately Cuki was only built for
            # those who want the weather forecast in Indonesia. Try another one!"
            api.update_status(
                'Hai! @' + mention.user.screen_name +
                " sayang sekali Cuki hanya dibuat untuk kamu yang ingin tau prakiraan cuaca di Indonesia saja\n\nAyo coba yang lain",
                mention.id)
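# Both reply_to_tweets() variants fetch the last seen id with a GET to
# http://188.166.216.148:4545/ and then call store_last_seen_id() to persist the
# newest mention id. The helper itself is not shown; a minimal sketch, assuming
# the same endpoint accepts the id in a POST body (the server contract is an
# assumption):
def store_last_seen_id(last_seen_id):
    url = "http://188.166.216.148:4545/"
    try:
        requests.post(url, data={"last_seen_id": last_seen_id}, timeout=10)
    except requests.RequestException as exc:
        print("failed to store last seen id:", exc, flush=True)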
def __init__(self, proxies={}, entropy_filter=True, lang='en',
             entropy_top=3, query_top=100, fraction=5):
    self.__ngd = NGD(proxies)
    # self.__ngd.set_context('site:imsdb.com')
    self.__cache = {}
    self.__min_ent = 0.0
    self.__entropy_filter = entropy_filter
    self.__lang = lang
    self.__entropy_top = entropy_top
    self.__fraction = fraction
    self.__query_top = query_top
    self.__translator = Translate()
    self.__lock = Lock()
    self.__voc_translator = None
    random.seed(666)
def Vacancy(link):
    url = link
    page = requests.get(url)

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div/div/div[1]/div[1]').get()
        description = remove_tags(description)
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_az = description
        else:
            description_en = description
            description_az = ""
    except:
        description_en = ""
        description_az = ""

    # Email
    try:
        driver.get(url)
        email = driver.find_element_by_xpath(
            '/html/body/div[3]/div[1]/div/div/div[1]/div[1]').text
        email = re.findall(r'[\w\.-]+@[\w\.-]+', email)
    except:
        email = []

    data = {
        "description_az": description_az,
        "description_en": description_en,
        "email": email
    }
    # print(data)
    return data

# Vacancy("https://azinka.az/jobs/3710/")
def Vacancy(link):
    url = link
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9,ru;q=0.8"
    }
    page = requests.get(url, headers=headers)

    # Company
    try:
        company = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/h4/text()').get()
    except:
        company = ""

    # Position
    try:
        position = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[2]/div/div[1]/h4/text()').get()
    except:
        position = ""

    # Logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/img/@src').get()
    except:
        logo = ""

    # Job type
    try:
        job_type = Selector(response=page).xpath(
            '/html/body/div[3]/div/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]//text()[2]'
        ).get()
        job_type = job_type.strip()
    except:
        job_type = ""

    # Contact person
    try:
        person = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[2]').get()
        person = person.strip()
    except:
        person = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[3]').get()
        email = email.strip()
        email = [email]
    except:
        email = []

    # Phone: numbers default to Armenia's country code 374 unless a "+" prefix says otherwise
    try:
        phone = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[4]').get()
        phone = phone.strip()
        if "," in phone:
            phones = phone.split(",")
            phone = []
            for each in phones:
                each = each.strip()
                if "+" in each and " " in each:
                    number = each.split(" ", 1)[1].replace('-', "").replace(" ", "")
                    country_code = each.split(" ", 1)[0].replace('+', "")
                    phone.append({"country_code": country_code, "number": number})
                elif "+" in each and " " not in each:
                    if "+374" in each:
                        country_code = "374"
                        number = each.replace("+374", "")
                        phone.append({"country_code": country_code, "number": number})
                    elif "+1" in each:
                        country_code = "1"
                        number = each.replace("+1", "")
                        phone.append({"country_code": country_code, "number": number})
                    else:
                        country_code = "374"
                        number = each
                        phone.append({"country_code": country_code, "number": number})
                elif "+" not in each:
                    number = each.replace('-', "").replace(" ", "")
                    country_code = "374"
                    phone.append({"country_code": country_code, "number": number})
        else:
            if "+" in phone and " " in phone:
                number = phone.split(" ", 1)[1].replace('-', "").replace(" ", "")
                country_code = phone.split(" ", 1)[0].replace('+', "")
                phone = [{"country_code": country_code, "number": number}]
            elif "+" in phone and " " not in phone:
                if "+374" in phone:
                    country_code = "374"
                    number = phone.replace("+374", "")
                    phone = [{"country_code": country_code, "number": number}]
                elif "+1" in phone:
                    country_code = "1"
                    number = phone.replace("+1", "")
                    phone = [{"country_code": country_code, "number": number}]
                else:
                    country_code = "374"
                    number = phone
                    phone = [{"country_code": country_code, "number": number}]
            elif "+" not in phone:
                number = phone.replace('-', "").replace(" ", "")
                country_code = "374"
                phone = [{"country_code": country_code, "number": number}]
    except Exception:
        phone = []

    # Website
    try:
        website = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[2]/div/text()[5]').get()
        website = website.strip()
        if "not" in website:
            website = []
        else:
            website = [website]
    except:
        website = []

    # Published (YYYY-MM-DD)
    try:
        published = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[2]').get()
        published = published.strip()
        publish_day = int(published.split("-")[2])
        publish_month = int(published.split("-")[1])
        publish_year = int(published.split("-")[0])
    except:
        publish_day = 0
        publish_month = 0
        publish_year = 0

    # Ends (deadline, YYYY-MM-DD)
    try:
        ends = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/text()[5]').get()
        ends = ends.strip()
        deadline_day = int(ends.split("-")[2])
        deadline_month = int(ends.split("-")[1])
        deadline_year = int(ends.split("-")[0])
    except:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0

    # Career level
    try:
        career_level = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[1]/text()').get()
        if career_level is None:
            career_level = ""
    except:
        career_level = ""

    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[2]/text()').get()
        if education is None:
            education = ""
    except:
        education = ""

    # Experience
    try:
        experience = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/span[3]/text()').get()
        if experience is None:
            experience = ""
    except:
        experience = ""

    # Salary
    try:
        salary = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[1]/div[2]/strong/text()').get()
        if "-" in salary:
            salary = salary.split("-")
            min_salary = int(salary[0].strip().replace(".", ""))
            max_salary = int(salary[1].strip().replace(".", ""))
        elif "-" not in salary and salary != "N/A":
            # the original called salary.replace(".") with one argument (a TypeError)
            min_salary = int(salary.replace(".", ""))
            max_salary = int(salary.replace(".", ""))
        else:
            min_salary = 0
            max_salary = 0
    except:
        min_salary = 0
        max_salary = 0

    # Vacancy description
    try:
        v_description = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[2]/div/div[1]').get()
        v_description = remove_tags(v_description).strip()
        v_description = v_description.replace('\xa0', " ")
    except:
        v_description = ""
    try:
        if detect(v_description) == "et":
            try:
                v_description_en = Translate(v_description)
            except:
                v_description_en = " "
            v_description_am = v_description
        else:
            v_description_en = v_description
            v_description_am = ""
    except:
        v_description_am = ""
        v_description_en = ""

    # Company description
    try:
        c_description = Selector(response=page).xpath(
            '//*[@id="loyal"]/div[1]/div[2]/div[2]/div[1]/p/text()').get()
        c_description = c_description.strip()
    except:
        c_description = ""
    try:
        if detect(c_description) == "et":
            try:
                c_description_en = Translate(c_description)
            except:
                c_description_en = " "
            c_description_am = c_description
        else:
            c_description_en = c_description
            c_description_am = ""
    except:
        c_description_am = ""
        c_description_en = ""

    data = {
        "company": company,
        "position": position,
        "logo": logo,
        "person": person,
        "job_type": job_type,
        "email": email,
        "phone": phone,
        "website": website,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "career_level": career_level,
        "education": education,
        "experience": experience,
        "min_salary": min_salary,
        "max_salary": max_salary,
        "v_description_am": v_description_am,
        "v_description_en": v_description_en,
        "c_description_am": c_description_am,
        "c_description_en": c_description_en,
    }
    print(data)
    return data

# Vacancy("https://rezume.am/job/2184")
#!/usr/bin/env python3
import os

from translator import Translate

messageWelcome = "Pig Latin"
messageGoodbye = "Goodbye! :)"

# Clear the display (one of the two calls fails silently depending on the OS)
os.system('cls')
os.system('clear')

# Print welcome message
print(messageWelcome + ": " + Translate(messageWelcome))
print("\nTo finish translating, enter 'quit'.\n")

done = False
while not done:
    text = input("Text to translate: ")
    done = text.lower() in ['quit', 'itquay']  # user quitting?
    print(Translate(text) + '\n')

print(Translate(messageGoodbye))
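# The translator module imported above is not included here. A minimal sketch
# of a Pig Latin Translate() that the script could run against (hypothetical
# implementation; the real module's rules may differ, e.g. for "qu" clusters):
def Translate(text):
    words = []
    for word in text.split():
        if word[0].lower() in "aeiou":
            words.append(word + "way")  # vowel-initial words just get "way"
        else:
            # move the leading consonant cluster to the end and add "ay"
            for i, ch in enumerate(word):
                if ch.lower() in "aeiou":
                    words.append(word[i:] + word[:i] + "ay")
                    break
            else:
                words.append(word + "ay")  # no vowels at all
    return " ".join(words)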
import requests
import json
# import numpy as np
import time

from translator import Translate
from check import Check

translate = Translate()
check = Check()


class RequestAPI:
    def get_by_city(self, c):
        self.url = "http://api.openweathermap.org/data/2.5/forecast"
        self.querystring = {"appid": "2651c986fc0256f04e92c5a71d08e870"}
        self.querystring['q'] = c
        response = requests.request("GET", self.url, params=self.querystring)
        # print("Collecting weather data for city", c)
        print("\r" + "Successfully collected weather data for city " + c + ".....")
        return json.loads(response.text)

    def get_by_coordinate(self, lat, lon):
        self.url = "http://api.openweathermap.org/data/2.5/forecast"
        self.querystring = {"appid": "2651c986fc0256f04e92c5a71d08e870"}
        self.querystring['lat'] = lat
        self.querystring['lon'] = lon
        response = requests.request("GET", self.url, params=self.querystring)
        return json.loads(response.text)

    def get_sys(self, c):
        self.url = "http://api.openweathermap.org/data/2.5/weather"
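# A quick usage sketch for RequestAPI (assumes the API key embedded above is
# still valid; the OpenWeatherMap forecast endpoint returns 3-hourly entries
# under the "list" key):
api = RequestAPI()
forecast = api.get_by_city("Jakarta")
for entry in forecast.get("list", [])[:3]:
    print(entry["dt_txt"], entry["weather"][0]["description"])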
def get_this_month_in_unicode(self):
    this_month_in_cyrillic = self.get_this_month_in_cyrillic()
    return Translate(
        cyrillic=this_month_in_cyrillic).translate_cyrillic_to_unicode()
def Vacancy(link): print("request sent for Vacancy succesfully") url = link print(url) # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # Published try: published = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[2]/span/text()[2]' ).get() published = published.strip().split(" ") publish_day = int(published[0].split("/")[0]) publish_month = int(published[0].split("/")[1]) publish_year = int(published[0].split("/")[2]) except Exception as e: publish_day = 0 publish_month = 0 publish_year = 0 if yesterday_day != publish_day or yesterday_month != publish_month: print("Not published yesterday") return # Location # try: location = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/ul/li[1]/text()' ).get() location = location.strip() location_id = [] location = {"city": f"{location}", "id": f"{Geonames(location)}"} location_id.append(location) except: location_id = [{'city': 'Yerevan', 'id': '616052'}] # Posted by try: posted_by = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[1]/text()' ).get() posted_by = posted_by.strip() except: posted_by = "" # Email try: email = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/div[1]/div[2]/p[2]/text()' ).get() email = email.strip() if email == "": email = [] else: email = [email] except: email = [] # Workspace try: workspace = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[2]/div[2]/div[2]/p/text()' ).get() workspace = workspace.strip() except: workspace = "" # Job_type try: job_type = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[3]/div[2]/div[2]/p/text()' ).get() job_type = job_type.strip() except: job_type = "" # Salary try: salary = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[4]/div[2]/div[2]/p/text()' ).get() salary = salary.strip().replace("Until ", "") if "-" in salary: salary = salary.split("-") min_salary = int(salary[0].strip()) max_salary = int(salary[1].strip()) elif "-" not in salary and salary != '': min_salary = int(salary) max_salary = int(salary) else: min_salary = 0 max_salary = 0 except: min_salary = 0 max_salary = 0 # Education try: education = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[5]/div[2]/div[2]/p/text()' ).get() education = education.strip() except: education = "" # Experience try: experience = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[6]/div[2]/div[2]/p/text()' ).get() experience = experience.strip() except: experience = "" # Gender try: gender = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[7]/div[2]/div[2]/p/i/@class' ).get() if "female" in gender: gender = "female" elif "male" in gender: gender = "male" else: gender = '' except: gender = "" # Age try: age = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[8]/div[2]/div[2]/p/text()' ).get() age = age.strip() except: age = "" print(1) # Description try: description = Selector(response=page).xpath( '/html/body/div[2]/div/div[2]/div/div/div[1]/div[2]/ul/li[10]/div[2]/div/p/text()' ).get() description = description.strip() except: description = "" description_en = "" description_am = "" try: if detect(description) == "et": try: 
description_en = Translate(description) except: description_en = "" description_am = description else: description_en = description description_am = "" except: description_en = "" description_am = "" # Phone try: phone = Selector(response=page).css( '#sidebar-border > div.detailed-info-block.form-inline.clearfix > div.clearfix > div > div.user-details' ).extract() phones = [] for phone in phone: phone = remove_tags(phone).strip() area_code = "374" number = phone.replace(" ", "") number = number.replace("-", "") number = number.replace("(", "") number = number.replace(")", "") phones.append({'country_code': area_code, "number": number}) except: phone = [] # Username try: username = Selector(response=page).xpath( '//*[@id="sidebar-border"]/div[1]/div[1]/div/div[1]/div[2]/div[1]/div[2]/h6/a/text()' ).get() username = username.strip() except: username = "" data = { "publish_day": publish_day, "publish_month": publish_month, "publish_year": publish_year, "location_id": location_id, "posted_by": posted_by, "email": email, "workspace": workspace, "job_type": job_type, "min_salary": min_salary, "max_salary": max_salary, "education": education, "experience": experience, "gender": gender, "age": age, "description_am": description_am, "description_en": description_en, "phone": phones, "username": username } print(data) return data # Vacancy("https://full.am/en/job/public/view/1163") # https://full.am/en/job/public/view/12067 # https://full.am/en/job/public/view/1163
def Vacancy(link): print("request sent for Vacancy succesfully") url = link print(url) page = requests.get(url) #headers=headers) # Description try: description = Selector( response=page).xpath('/html/body/section[2]/div[3]').get() description = remove_tags(description) except: description = "" try: if detect(description) == "et": try: description_en = Translate(description) except: description_en = "" description_am = description else: description_en = description description_am = "" except: description_en = "" description_am = "" # Company Link try: c_link = Selector(response=page).xpath( '/html/body/section[2]/section/div[1]/div[2]/a/@href').get() c_link = "http://hr.am" + c_link except: c_link = "" # Email try: driver.get(c_link) email = driver.find_element_by_xpath( '/html/body/section[2]/div[10]/div[1]/a').get_attribute("href") email = email.replace('mailto:', "") email = [email] except: email = [] if email == []: try: email = email = re.findall(r'[\w\.-]+@[\w\.-]+', description) except: email = [] # Phone try: phone = re.search(r"\d{9}", v_description_en).group() phone = [{"country_code": "374", "number": phone}] except: phone = [] data = { "description_en": description_en, "description_am": description_am, "c_link": c_link, "email": email, "phone": phone } print(data) return data # Vacancy('http://hr.am/vacancy/view/vid/73244/t/')
def __init__(self):
    # Python keywords mapped to their Persian equivalents
    self.engToPerDic = {
        "False": "غلط",
        "None": "هیچی",
        "True": "درست",
        "and": "و",
        "as": "بعنوان",
        "assert": "تاکید",
        "break": "خارج",
        "class": "کلاس",
        "continue": "ادامه",
        "def": "تعریف",
        "del": "حذف",
        "elif": "شرط_دیگر",
        "else": "درغیراینصورت",
        "except": "بجز",
        "finally": "دراخر",
        "for": "برای",
        "from": "از",
        "global": "جهانی",
        "if": "اگر",
        "import": "واردکن",
        "in": "داخل",
        "is": "هست",
        "lambda": "لاندا",
        "nonlocal": "غیرمحلی",
        "not": "نفی",
        "or": "یا",
        "pass": "بگذر",  # restored from the reverse map; the original value was masked as "******"
        "raise": "زیادکن",
        "return": "برگردان",
        "try": "ازمون",
        "while": "تازمانیکه",
        "with": "با",
        "yield": "واگذارکن",
        "self": "خود",
        "print": "چاپ"
    }
    # Reverse map: Persian keywords back to Python
    self.perToEndDic = {
        "غلط": "False",
        "هیچی": "None",
        "درست": "True",
        "و": "and",
        "بعنوان": "as",
        "تاکید": "assert",
        "خارج": "break",
        "کلاس": "class",
        "ادامه": "continue",
        "تعریف": "def",
        "حذف": "del",
        "شرط_دیگر": "elif",
        "درغیراینصورت": "else",
        "بجز": "except",
        "دراخر": "finally",
        "برای": "for",
        "از": "from",
        "جهانی": "global",
        "اگر": "if",
        "واردکن": "import",
        "داخل": "in",
        "هست": "is",
        "لاندا": "lambda",
        "غیرمحلی": "nonlocal",
        "نفی": "not",
        "یا": "or",
        "بگذر": "pass",
        "زیادکن": "raise",
        "برگردان": "return",
        "ازمون": "try",
        "تازمانیکه": "while",
        "با": "with",
        "واگذارکن": "yield",
        "خود": "self",
        "چاپ": "print"
    }
    self.translator = Translate()
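# A short usage sketch for the keyword maps above (hypothetical method name
# to_persian; a real transpiler would need a proper tokenizer so that string
# literals and identifiers are left untouched):
def to_persian(self, source_line):
    # naive word-by-word substitution using the English-to-Persian map
    return " ".join(self.engToPerDic.get(tok, tok) for tok in source_line.split())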
def Vacancy(link):
    url = link
    page = requests.get(url)

    # Age
    try:
        age = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div[5]/div/div[1]/ul/li[2]/div[2]/text()'
        ).get()
    except:
        age = ""

    # Education
    try:
        education = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div[5]/div/div[1]/ul/li[3]/div[2]/text()'
        ).get()
    except:
        education = ""

    # Experience
    try:
        experience = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div[5]/div/div[1]/ul/li[4]/div[2]/text()'
        ).get()
    except:
        experience = ""

    # Published ("Month DD, YYYY"; month names resolved via the months dict)
    try:
        published = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div[5]/div/div[1]/ul/li[5]/div[2]/text()'
        ).get()
        published = published.split(",")
        publish_day = int(published[0].split(" ")[1])
        publish_month = int(months[published[0].split(" ")[0]])
        publish_year = int(published[1].strip())
    except Exception:
        publish_day = 0  # the original stored the exception object here
        publish_month = 0
        publish_year = 0

    # Ends
    try:
        ends = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div[5]/div/div[1]/ul/li[6]/div[2]/text()'
        ).get()
        ends = ends.split(",")
        deadline_day = int(ends[0].split(" ")[1])
        deadline_month = int(months[ends[0].split(" ")[0]])
        deadline_year = int(ends[1].strip())
    except Exception:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0

    # Phone (one or two numbers in "(xx) xxx-xx-xx" form, country code 994)
    try:
        phone = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div[5]/div/div[2]/ul/li[1]/div[2]').get()
        phone = remove_tags(phone)
        phone = phone.split("(")
        if len(phone) == 2:
            phone = phone[1].replace(")", "")
            phone = phone.replace("-", "").replace(" ", "")
            phones = [{"country_code": "994", "number": phone}]
        elif len(phone) == 3:
            phones = []
            number1 = phone[1].replace(")", "")
            number1 = number1.replace("-", "").replace(" ", "")
            phones.append({"country_code": "994", "number": number1})
            number2 = phone[2].replace(")", "")
            number2 = number2.replace("-", "").replace(" ", "")
            phones.append({"country_code": "994", "number": number2})
        else:
            phones = []
    except Exception:
        phones = []

    # Email
    try:
        driver.get(url)
        email = driver.find_element_by_xpath(
            '/html/body/div[3]/div[1]/div[5]/div/div[2]/ul/li[2]/div[2]/a'
        ).text
        email = [email]
    except:
        email = []

    # Description
    try:
        description = Selector(
            response=page).xpath('/html/body/div[3]/div[1]/div[6]').get()
        description = remove_tags(description)
    except:
        description = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_az = description
        else:
            description_en = description
            description_az = ""
    except:
        description_en = ""
        description_az = ""

    data = {
        "age": age,
        "education": education,
        "experience": experience,
        "publish_day": publish_day,
        "publish_month": publish_month,
        "publish_year": publish_year,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "phone": phones,
        "email": email,
        "description_az": description_az,
        "description_en": description_en,
    }
    # print(data)
    return data

# Vacancy('https://boss.az/vacancies/161045')
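# The phone handling above strips "(", ")", "-", and spaces and attaches
# Azerbaijan's country code 994 to each number. The same normalization as a
# compact helper (hypothetical name; mirrors the branches in Vacancy above):
def normalize_az_numbers(raw):
    parts = raw.split("(")[1:]  # each "(" starts one phone number
    return [{"country_code": "994",
             "number": p.replace(")", "").replace("-", "").replace(" ", "")}
            for p in parts]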
def Vacancy(link, cookies):
    print("request sent for Vacancy successfully")
    url = link
    print(url)
    cookies = {"Cookie": cookies}
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36"}
    page = requests.get(url, cookies=cookies)

    # Stack
    try:
        stack = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[1]/span[2]/text()').get()
    except:
        stack = ""

    # Education
    try:
        education = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[3]/span[contains(.,"Education:")]'
        ).get()
        education = education.split("</strong>")[1]
        education = education.split("</span>")[0].strip()
    except:
        education = ""

    # Languages
    try:
        languages = Selector(response=page).xpath(
            '//*[@id="page"]/div/div/div/div/div/div[3]/span[contains(.,"Languages:")]'
        ).get()
        languages = languages.split("</strong>")[1]
        languages = languages.split("</span>")[0].strip()
    except:
        languages = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[2]/div/aside[2]/div/div/span/a/text()'
        ).get()
    except:
        email = ""
    if email is None:
        email = ""

    # Logo
    try:
        logo = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[2]/div/aside[3]/div/div/figure/img/@src'
        ).get()
    except:
        logo = ""
    if logo is None:
        logo = ""

    # Description
    try:
        description = Selector(response=page).xpath(
            '//*[@id="page"]/main/div/div/div[1]/div[1]/article/div').get()
        description = remove_tags(description)
        description = description.strip()
        description = description.replace('*', "")
        description = re.sub(r"\s+", " ", description)
    except:
        description = ""

    try:
        lang = detect(description)
    except Exception:  # detect() raises on empty input
        lang = ""
    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # contains(text(),"STODOLINK")
    data = {
        "stack": stack,
        "education": education,
        "languages": languages,
        "email": email,
        "logo": logo,
        "description": description,
        "description_ka": description_ka,
        "description_ru": description_ru,
        "description_en": description_en
    }
    print("Vacancy scraped successfully")
    # print(data)
    return data

# Vacancy('https://www.cv.ge/announcement/127911/office-housekeeper',
#         '_ga=GA1.2.2101960191.1593693483; _gid=GA1.2.453973920.1593693483; WSID=dnh4xi0r4g1qhrtdiygzn241; __RequestVerificationToken=-ZO3RUnIkifRk6Z-oYnkY1BO7sljzPZhydaRlB23lP0PyUlYkuV0iw3TrkEAsMFrOxCONP1xxAIZh8qzX2tzB_D5DSiXD8G3RyUdZn-wyGE1; LastVisit=2020-07-02T16:38:08.2821857+04:00; _gat=1')
def Vacancy(link): print("request sent for Vacancy succesfully") url = link print(url) # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # C_Link try: c_link = Selector(response=page).xpath( "/html/body/div[2]/div[3]/div[3]/div[2]/div/div/a/@href").get() c_link = "https://staff.am" + c_link except: c_link = "" # Industry try: industry = Selector(response=page).xpath( '/html/body/div[2]/div[3]/div[3]/div[4]/div[1]/div/div/div[1]/div[1]/p[1]/span[2]/text()' ).get() except: industry = "" # Views try: views = Selector(response=page).xpath( '/html/body/div[2]/div[3]/div[3]/div[4]/div[1]/div/div/div[1]/div[1]/p[2]/span/text()' ).get() except: views = "" # Followers try: followers = Selector( response=page).xpath('//*[@id="followers_count"]/text()').get() except: followers = "" # Employment_Term try: employment_term = Selector(response=page).xpath( '//*[@id="job-post"]/div[1]/div[3]/p[1]').get() employment_term = employment_term.split("</span> ") employment_term = remove_tags(employment_term[1]).strip() except: employment_term = "" # Category try: category = Selector(response=page).xpath( '//*[@id="job-post"]/div[1]/div[3]/p[2]').get() category = category.split("</span> ") category = remove_tags(category[1]).strip() except: category = "" # Job_Type try: job_type = Selector(response=page).xpath( '//*[@id="job-post"]/div[1]/div[4]/p[1]').get() job_type = job_type.split("</span> ") job_type = remove_tags(job_type[1]).strip() except: job_type = "" # Deadline try: ends = Selector(response=page).xpath( '//*[@id="job-post"]/div[1]/div[2]/p/text()').get() ends = ends.replace("\n", " ") ends = ends.replace(" Deadline: ", "") ends = ends.split(" ") deadline_day = int(ends[0]) deadline_month = int(months_en[ends[1]]) deadline_year = int(ends[2]) except: deadline_day = 0 deadline_month = 0 deadline_year = 0 # Description try: description = Selector( response=page).xpath('//*[@id="job-post"]/div[2]').get() description = remove_tags(description) except: description = "" description_en = "" description_am = "" if detect(description) == "et": try: description_en = Translate(description) except: description_en = "" description_am = description else: description_en = description description_am = "" # Website try: website = Selector(response=page).xpath( '//*[@id="company-contact-details"]/div[1]/p[contains(., "Website")]/a/@href' ).get() website = [website] except: website = [] # Phone Number try: phone = Selector(response=page).xpath( '//*[@id="company-contact-details"]/div[1]/p[contains(., "Phone")]' ).get() phone = phone.split("</span>")[1].split("</p>")[0].strip() if "," in phone: phones = [] phone = phone.split(", ") phone1 = phone[0].replace(") ", "") number1 = phone1.replace("-", "") number1 = number1.replace("(", "") phone1 = {"country_code": "374", "number": number1} phone2 = phone[1].replace(") ", "") number2 = phone2.replace("-", "") number2 = number2.replace("(", "") phone2 = {"country_code": "374", "number": number2} phones.append(phone1) phones.append(phone2) else: number = phone.replace(") ", "") number = number.replace("(", "") number = number.replace("-", "") phones = [{"country_code": "374", "number": number}] except: phones = [] # Address try: address = Selector(response=page).xpath( '//*[@id="company-contact-details"]/div[1]/p[contains(., "Address")]' ).get() address = remove_tags(address) address = address.replace("Address: ", "").strip() # Garegin Hovsepyan 20, Yerevan, Armenia except: address = "" # About Company try: c_description = 
Selector(response=page).xpath( '//*[@id="company-details"]/div[1]/div[2]').get() c_description = remove_tags(c_description).strip() except: c_description = "" # Canditate Level try: candidate_level = Selector(response=page).xpath( '//*[@id="job-post"]/div[2]/h3[contains(., "candidate level")]/span/text()' ).get() if candidate_level == "Not defined": candidate_level = "" except: candidate_level = "" # Email try: driver.get(link) email = driver.find_element_by_class_name('desc_email').text email = [email] except: email = [] print("Vacancy Scraped Successfully") ccc = Company_Info(c_link) # //*[@id="followers_count"] data = { "c_link": c_link, "industry": industry, "views": views, "followers": followers, "employment_term": employment_term, "category": category, "job_type": job_type, "deadline_day": deadline_day, "deadline_month": deadline_month, "deadline_year": deadline_year, "description_en": description_en, "description_am": description_am, "website": website, "phone": phones, "address": address, "company_description": c_description, "candidate_level": candidate_level, "email": email, "type_of_company": ccc["type_of_company"], "N_of_employees": ccc["N_of_employees"], "foundation_date": ccc["foundation_date"] } print("Data is ready to be added to a DB") return data # Vacancy('https://staff.am/en/digital-marketing-specialist-219') # https://staff.am/en/apariki-zargacman-bazni-asxatakic # https://staff.am/en/digital-marketing-specialist-219 # //*[@id="job-post"]/div[1]/div[2]/p # //*[@id="job-post"]/div[1]/div[2]/p
def get_products():
    for prod in links_db.find({"parsed": False}):
        req = requests.get(prod["link"])
        _print.value(prod["link"])
        city = select_one(req, "#df_field_mdebareoba .value", True)
        bread_crumbs = select_many(req, "#bread_crumbs .point1 li a::text")
        geonames_id = geo_names(city)
        deal_type = get_deal_type(bread_crumbs[2], prod['link'])
        property_type = get_property_type(bread_crumbs[1], prod['link'])
        status = get_status(
            select_one(req, "#df_field_built_status .value", True), prod["link"])
        street = select_one(req, "#df_field_mdebareoba_level1 .value", True)
        address = select_one(req, "#df_field_mdebareoba_level2 .value", True)
        bedrooms = select_one(req, "#df_field_bedrooms .value", True)
        bathrooms = select_one(req, "#df_field_bathrooms .value", True)
        total_area = string_to_int(
            select_one(req, "#df_field_square_feet .value", True))[0]
        floor = select_one(req, "#df_field_floor .value", True)
        floors = select_one(req, "#df_field_number_of_floors .value", True)
        try:
            _view = int(select_one(req, "#area_listing .count::text"))
        except:
            _view = 0
        outdoor_features = get_outdoor_features(req)
        indoor_features = get_indoor_features(req)
        climate_control = get_climate_control(req)
        details = [{
            "title": select_one(req, "#area_listing > h1"),
            "house_rules": "",
            "description": Translate(
                select_one(req, "#df_field_additional_information .value", True))
        }]
        price = {
            "price_type": "total_price",
            "min_price": 0,
            "max_price": 0,
            "fix_price": converted_price(
                select_one(req, "#lm_loan_amount::attr(value)"), prod["link"]),
            "currency": "USD"
        }
        phones = [{
            "country_code": 995,
            "number": converted_price(
                select_one(req, "#df_field_phone .value a::text"), prod["link"])
        }]
        files = get_images(req)
        try:
            real_estate_db.insert_one({
                "location": {
                    "country": {"id": "GE"},
                    "city": {
                        "id": geonames_id,
                        "name": city,
                        "subdivision": ""
                    },
                    "street": street,
                    "address": address,
                },
                "created_at": datetime.datetime.utcnow(),
                "deal_type": deal_type,
                "type_of_property": [property_type],
                "status": status,
                "bedrooms": bedrooms,
                "bathrooms": bathrooms,
                "total_area": total_area,
                "metric": "feet_square",
                "floor": floor,
                "floors": floors,
                "car_spaces": 0,
                "is_agent": True,
                "outdoor_features": outdoor_features,
                "indoor_features": indoor_features,
                "climate_control": climate_control,
                "detail": details,
                "price": price,
                "phones": phones,
                "files": files,
                "source": "Home.ge",
                "view": _view
            })
            links_db.update_one({"link": prod["link"]},
                                {"$set": {"parsed": True}})
        except:
            log_error(req.url, "პროდუქტის", True)  # "პროდუქტის" is Georgian for "product's"
# Logo
try:
    logo = driver.find_element_by_xpath(
        '//*[@id="uinfo"]/div[1]/a/img').get_attribute("src")
except:
    logo = ""

# Description
try:
    description = driver.find_element_by_xpath(
        '//*[@id="pcontent"]/div/div[3]').text
except:
    description = ""
description_en = ""
description_am = ""
try:
    if detect(description) == "et":
        try:
            description_en = Translate(description)
        except:
            description_en = ""
        description_am = description
    else:
        description_en = description
        description_am = ""
except:  # detect() raises on empty input
    description_en = ""
    description_am = ""

# Phone (clicking the link reveals the number)
try:
    driver.find_element_by_xpath('//*[@id="uinfo"]/div[2]/div[2]/a').click()
    phone = driver.find_element_by_xpath(
        '//*[@id="callPhoneInfo"]/div[3]/div').text
    if "\n" not in phone:
        phone = phone.replace(" ", "")
def all_reply(message):
    bot.send_message(message.chat.id, Translate(message.text))
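# bot.send_message(chat_id, text) matches pyTelegramBotAPI (telebot). There, a
# catch-all handler like all_reply is registered against the bot and polling is
# started; a sketch, assuming bot = telebot.TeleBot(token) and the all_reply
# defined above:
bot.register_message_handler(all_reply, func=lambda message: True)
bot.infinity_polling()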
proxies = {
    'http': 'http://10.10.1.10:3128',
    'https': 'http://10.10.1.10:1080',
}

for i in range(58214, 2000000):
    url = f"https://www.e-register.am/en/companies/{i}"
    page = requests.get(url)
    name_am = Selector(
        response=page).xpath('//*[@id="page"]/div[1]/text()').get()
    if name_am is None:
        print("No company on:", i)
        continue
    name_en = Translate(name_am)
    # The registration cell packs "number / date" into one field
    registration_number = Selector(response=page).xpath(
        '//*[@id="page"]/table[3]/tr[2]/td[2]/text()').get()
    foundation_date = registration_number.split("/")[1].strip()
    registration_number = registration_number.split("/")[0].strip()
    tax_id = Selector(response=page).xpath(
        '//*[@id="page"]/table[3]/tr[3]/td[2]/text()').get()
    z_code = Selector(response=page).xpath(
        '//*[@id="page"]/table[3]/tr[4]/td[2]/text()').get()
    data = {
def Vacancy(link, location_id, cookies):
    print("request sent for Vacancy successfully")
    url = link
    print(url)
    cookies = {"Cookie": cookies}
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36"}
    page = requests.get(url, cookies=cookies)

    # Data
    soup = BeautifulSoup(page.text, 'html.parser')
    details = soup.find('div', attrs={"class": "anncmt-details"})

    # ---------------------------------- Requirements ----------------------------------

    # Position
    try:
        position = soup.find('div', attrs={"class": "anncmt-title"}).text
        position = position.strip()
    except:
        position = ""
        print("There is no vacancy anymore")
    print("Position: ", position)

    # Company
    try:
        company = soup.find('div', attrs={"class": "anncmt-customer"}).text
        company = company.strip()
    except:
        company = ""
        print("There is no vacancy anymore")
    print("Company: ", company)

    # Dates
    try:
        published = str(details).split("<strong>Dates:</strong>")
        published = published[1].split("-")
        ends = published[1].split("</td>")
        ends = ends[0].strip()
        ends = ends.split()
        ends = ends[0] + "/" + months[f"{ends[1]}"]
        published = published[0].strip()
        published = published.split()
        # Converting the verbal month into its numeric form
        published = published[0] + "/" + months[f"{published[1]}"]
    except:
        published = ""
        ends = ""
    print("Published: ", published)
    print("Ends: ", ends)

    # Location
    # try:
    #     location = str(details).split("<strong>Location:</strong>")
    #     location = location[1].split("</td>")
    #     location = location[0].strip()
    #     location = location.replace("<span>", "").replace("</span>", "")
    #     if "," in location:
    #         location_id = []
    #         locations = location.split(',')
    #         for location1 in locations:
    #             location1 = location1.strip()
    #             try:
    #                 location_id.append({"Location": f"{location1}", "ID": f"{Geonames(location1)}"})
    #             except:
    #                 location_id.append({"Location": f"{location1}", "ID": ""})
    #     else:
    #         location_id = [{"Location": f"{location}", "ID": f"{Geonames(location)}"}]
    # except:
    #     location_id = [{"Location": "", "ID": ""}]
    # print("Location: ", location_id)

    # Job type
    try:
        jtype = str(details).split("<strong> Employment form:</strong>")
        jtype = jtype[1].split("</td>")
        jtype = jtype[0].strip()
    except:
        jtype = ""
    print("Job_Type: ", jtype)

    # Salary + bonuses ("+" marks a bonus, "-" marks a range)
    try:
        salary = str(details).split("<strong> Salary:</strong>")
        salary = salary[1].split("</td>")
        salary = salary[0].strip()
        if "+" in salary and "-" not in salary:
            max_salary = int(salary.split("+")[0].rstrip())
            min_salary = max_salary
            bonuses = "Yes"
        elif "+" in salary and "-" in salary:
            salary = salary.split("+")[0].rstrip()
            max_salary = int(salary.split("-")[1])
            min_salary = int(salary.split("-")[0])
            bonuses = "Yes"
        elif "+" not in salary and "-" in salary:
            max_salary = int(salary.split("-")[1])
            min_salary = int(salary.split("-")[0])
            bonuses = "No"
        else:
            min_salary = max_salary = int(salary)
            bonuses = "No"
    except:
        min_salary = 0
        max_salary = 0
        bonuses = "No"
    print("Min_Salary: ", min_salary)
    print("Max_Salary: ", max_salary)
    print("Bonuses: ", bonuses)

    # Experience
    try:
        experience = str(details).split("<strong> Experience:</strong>")
        experience = experience[1].split("</td>")
        experience = experience[0].strip()
    except:
        experience = ""
    print("Experience: ", experience)

    # Education
    try:
        education = str(details).split("<strong> Education:</strong>")
        education = education[1].split("</td>")
        education = education[0].strip()
    except:
        education = ""
    print("Education: ", education)

    # Languages
    try:
        languages = str(details).split("<strong> Languages:</strong>")
        languages = languages[1].split("</span>")
        languages = languages[0].replace("<span>", "").lstrip()
    except:
        languages = ""
    print("Languages: ", languages)

    # Driver's license
    try:
        dLicense = str(details).split("<strong> Driving licence:</strong>")
        dLicense = dLicense[1].split("</td>")
        dLicense = dLicense[0].strip()
        dLicense = dLicense.replace("<span>", "").replace("</span>", "")
    except:
        dLicense = ""
    print("D_License: ", dLicense)

    # ---------------------------------- Info ----------------------------------

    # E-mail ("ელ. ფოსტა" is Georgian for "e-mail")
    # try:
    #     email = str(details).split("<strong> ელ. ფოსტა:</strong>")
    #     email = email[1].split("</td>")
    #     email = email[0].lstrip()
    #     raw_email = email.rstrip()
    #     email = re.findall(r'[\w\.-]+@[\w\.-]+', raw_email)[0]
    # except:
    #     email = ""
    # print("Email: ", email)

    # Phone number
    try:
        pNumber = str(details).split("<strong> Phone:</strong>")
        pNumber = pNumber[1].split("</td>")
        pNumber = pNumber[0].strip()
    except:
        pNumber = ""
    print("Phone_Number: ", pNumber)

    # Address
    try:
        address = str(details).split("<strong> Address:</strong>")
        address = address[1].split("</td>")
        address = address[0].strip()
    except:
        address = ""
    print("Address: ", address)

    # Description
    try:
        description = soup.find('div', attrs={"class": "firm-descr"}).text
        description = description.strip()
    except:
        description = ""
    print("Description: ", description)

    # The language is detected from the position title
    try:
        lang = detect(position)
    except Exception:
        lang = ""
    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        description_en = Translate(description)
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""
    print("-" * 80)

    # Email
    try:
        email = re.findall(r'[\w\.-]+@[\w\.-]+', description)[0]
    except:
        try:
            email = str(details).split("<strong> Email:</strong>")
            email = email[1].split("</td>")
            email = email[0].lstrip()
            raw_email = email.rstrip()
            email = re.findall(r'[\w\.-]+@[\w\.-]+', raw_email)[0]
        except:
            email = ""
    print("Email: ", email)

    # Web link
    try:
        web_link = re.search(r"(?P<url>https?://[^\s]+)", description).group("url")
    except:
        web_link = ""
    print("Web_link: ", web_link)

    data = {
        "Job_Type": jtype,
        "Min_Salary": min_salary,
        "Max_Salary": max_salary,
        "Bonuses": bonuses,
        "Experience": experience,
        "Education": education,
        "Languages": languages,
        "Driver_License": dLicense,
        "Location": location_id,
        "Address": address,
        "Email": email,
        "Phone_Number": pNumber,
        "Web_Link": web_link,
        "Description_en": description_en,
        "Description_ru": description_ru,
        "Description_ka": description_ka
    }
    print("returned successfully")
    return data
class ChatBot:
    def __init__(self, proxies={}, entropy_filter=True, lang='en',
                 entropy_top=3, query_top=100, fraction=5):
        self.__ngd = NGD(proxies)
        # self.__ngd.set_context('site:imsdb.com')
        self.__cache = {}
        self.__min_ent = 0.0
        self.__entropy_filter = entropy_filter
        self.__lang = lang
        self.__entropy_top = entropy_top
        self.__fraction = fraction
        self.__query_top = query_top
        self.__translator = Translate()
        self.__lock = Lock()
        self.__voc_translator = None
        random.seed(666)

    def set_voc_translator(self, voc_trans=None):
        self.__voc_translator = voc_trans

    def entropy_min(self, e_min):
        self.__min_ent = e_min

    def reply_to(self, chat_line):
        self.__lock.acquire()
        try:
            chat_line = normalize_token(chat_line)
            if self.__lang != 'en':
                chat_line = self.__translator.translate(chat_line, self.__lang, 'en')
            snippets, answers = [], []
            # Query movie-script snippets, shortening the line until something matches.
            while len(answers) == 0:
                snippets = self.__ngd.snippets_query(
                    '"%s" site:imsdb.com' % chat_line, self.__query_top)
                answers = self.__extract_answers(snippets, chat_line)
                if len(answers) == 0:
                    chat_line = chat_line[:-1]
                    if len(chat_line) == 0:
                        break
                    continue
            probabilities = self.__build_probs(answers)
            new_ans = []
            for i in range(min(len(answers), self.__fraction)):
                new_ans.append(self.__choose_random_answer(probabilities))
            answers = list(set(new_ans))
            new_answers = []
            for ans in answers:
                if self.__entropy_filter:
                    val = self.__ngd.distance(('"%s"' % chat_line, '"%s"' % ans))
                    if val:
                        print('search engine distance (choosing response): %s %f'
                              % (ans, val))
                        time.sleep(0.25)
                        new_answers.append((ans, val))
            if self.__entropy_filter:
                new_answers.sort(key=lambda x: x[1])  # was a Python 2 cmp (second_compare)
                # new_answers.reverse()
                new_answers = [x[0] for x in new_answers[:self.__entropy_top]]
                answers = [x for x in answers if x in new_answers]
            ans = None
            if len(answers) > 0:
                ans = answers[random.randint(0, len(answers) - 1)]
            if not ans:
                ans = 'ah'
            # use vocabulary translator, if available
            if self.__voc_translator:
                ans = self.__voc_translator(ans)
            if ans and self.__lang != 'en':
                ans = self.__translator.translate(ans, 'en', self.__lang).lower()
            if not ans:
                ans = 'ah'
            return ans
        finally:
            self.__lock.release()  # release the lock, no matter what

    def __extract_answer(self, snippet, chat_line):
        snippet = normalize_token(snippet)
        snippet = re.sub(r'\([^\)]+\) ', '', snippet)
        snippet = re.sub(r'\[[^\)]+\] ', '', snippet)
        # Script lines look like "CHARACTER sentence." -- match the all-caps
        # speaker name followed by one sentence.
        iterator = re.finditer('[A-Z][A-Z]+ [^\.!?]+[\.!?]', snippet)
        lines = []
        for match in iterator:
            line = match.group()
            line_s = line.split(' ')
            line = ' '.join(line_s[1:]).lower()
            line = html2text(line)
            line = line.replace('_', '').replace('\n', '')
            # line = re.sub('\([^\)]+\) ', '', line)
            if ('-' not in line and ':' not in line and '**' not in line
                    and '(' not in line and ')' not in line and '"' not in line):
                if len(line) > 0 and line[-1] == '.':
                    line = line[:-1]
                lines.append(line)
        if len(lines) == 0:
            return ''
        prev = lines[0].lower()
        ret = []
        for i in range(1, len(lines)):
            if chat_line.lower() in prev:
                ret.append(lines[i].lower())
            prev = lines[i].lower()
        return ret

    def __extract_answers(self, snippets, chat_line):
        ret = []
        for snippet in snippets:
            anss = self.__extract_answer(snippet, chat_line)
            for ans in anss:
                if ans != '':
                    ret.append(ans.strip())
        return ret

    def __build_probs(self, answers):
        d = {}
        for ans in answers:
            d[ans] = d.get(ans, 0) + 1
        ret = []
        for ans, cnt in d.items():
            ret.append((ans, float(cnt) / len(answers)))
        return ret

    def __choose_random_answer(self, probs):
        rand_float = random.random()
        total = 0.0
        ret = None
        for ans, prob in probs:
            total += prob
            if total >= rand_float:
                ret = ans
                break
        return ret

    def start(self):
        msg = ''
        while msg != 'bye':
            msg = input('You: ')
            ans = self.reply_to(msg.strip())  # reply_to returns a single string
            print('Bot:', ans)
        print('end of chat.')

    def save_cache(self):
        self.__ngd.save_cache()
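# Usage sketch for ChatBot (assumes the NGD search wrapper and Translate are
# available; a non-'en' lang makes reply_to translate queries and answers):
chatbot = ChatBot(entropy_filter=True, lang='en')
chatbot.start()        # interactive loop; type 'bye' to exit
chatbot.save_cache()   # persist the NGD query cache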
def get_products():
    from translator import Translate
    for prod in links_db.find({"parsed": False, "source": "ss.ge"}):
        req = requests.get(prod["link"])
        _print.value(prod["link"])
        bread_crumbs = select_many(req, ".detailed_page_navlist ul li a::text")
        city = bread_crumbs[3]
        geonames_id = geo_names(city)
        deal_type = get_deal_type(bread_crumbs[2].strip(), prod['link'])
        property_type = get_property_type(bread_crumbs[1].strip(), prod['link'])
        status = get_status(
            select_one(req, "#fieldValueStatusId2::text"), prod["link"])
        street = select_one(req, ".StreeTaddressList.realestatestr::text").strip()
        address = street
        bedrooms = int(select_many(req, ".ParamsHdBlk text::text")[2])
        bathrooms = ""
        total_area = string_to_int(
            select_many(req, ".ParamsHdBlk text::text")[0])[0]
        floor = string_to_int(select_many(req, ".ParamsHdBlk text::text")[3])
        floors = string_to_int(
            select_one(req, ".ParamsHdBlk text text span::text"))
        try:
            _view = int(select_one(req, ".article_views span::text"))
        except:
            _view = 0
        outdoor_features = get_outdoor_features(req)
        indoor_features = get_indoor_features(req)
        climate_control = get_climate_control(req)
        details = [{
            "title": select_one(req, "#area_listing > h1"),
            "house_rules": "",
            "description": Translate(
                select_one(req, "#df_field_additional_information .value", True))
        }]
        price = {
            "price_type": "total_price",
            "min_price": 0,
            "max_price": 0,
            "fix_price": converted_price(
                select_one(req, "#lm_loan_amount::attr(value)"), prod["link"]),
            "currency": "USD"
        }
        phones = [{
            "country_code": 995,
            "number": converted_price(
                select_one(req, "#df_field_phone .value a::text"), prod["link"])
        }]
        files = get_images(req)
        pprint({
            "location": {
                "country": {"id": "GE"},
                "city": {
                    "id": geonames_id,
                    "name": city,
                    "subdivision": ""
                },
                "street": street,
                "address": address,
            },
            "created_at": datetime.datetime.utcnow(),
            "deal_type": deal_type,
            "type_of_property": [property_type],
            "status": status,
            "bedrooms": bedrooms,
            "bathrooms": bathrooms,
            "total_area": total_area,
            "metric": "feet_square",
            "floor": floor,
            "floors": floors,
            "car_spaces": 0,
            "is_agent": True,
            "outdoor_features": outdoor_features,
            "indoor_features": indoor_features,
            "climate_control": climate_control,
            "detail": details,
            "price": price,
            "phones": phones,
            "files": files,
            "source": "ss.ge",
            "view": _view
        })
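# select_one, select_many, and string_to_int are shared helpers used by both
# get_products() variants but not defined in this collection. A sketch of
# string_to_int under the assumption that it pulls the integers out of strings
# like "120 m2" (hypothetical; the real helper may differ):
def string_to_int(text):
    if not text:
        return [0]
    numbers = [int(n) for n in re.findall(r"\d+", text)]
    return numbers or [0]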
def Vacancy_info(url):
    print(url)
    page = requests.get(url)

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[4]').get()
        description = remove_tags(description)
        description = description.strip()
        description = re.sub(r"\s+", " ", description)
        print(description)
    except:
        description = ""

    try:
        lang = detect(description)
    except Exception:  # detect() raises on empty input
        lang = ""
    if lang == "ru":
        description_ru = description
        description_en = Translate(description)
        description_ka = ""
    elif lang == "et":
        description_ru = ""
        try:
            description_en = Translate(description)
        except:
            description_en = ""
        description_ka = description
    else:
        description_ru = ""
        description_en = description
        description_ka = ""

    # Email
    try:
        email = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[2]/div[2]/div/div/a/@href'
        ).get()
        email = email.replace("mailto:", "")
    except:
        email = ""

    # Location (fall back to Tbilisi, GeoNames id 611717)
    try:
        location = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[1]/div[2]/span/text()'
        ).get()
        location_id = []
        try:
            location_id.append({
                "city": f"{location}",
                "id": f"{Geonames(location)}"
            })
        except:
            location_id.append({"city": f"{location}", "id": "611717"})
    except:
        location_id = [{"city": "Tbilisi", "id": "611717"}]

    # Category
    try:
        category = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[2]/div[2]/span[1]/text()'
        ).get()
    except:
        category = ""

    # Stack
    try:
        stack = Selector(response=page).xpath(
            '/html/body/div[2]/div/div[1]/div[2]/div[3]/div[2]/div[4]/div[2]/text()'
        ).get()
        if "სრული განაკვეთი" in stack:  # Georgian for "full time"
            stack = "Full-Stack"
    except:
        stack = ""

    data = {
        "description_en": description_en,
        "description_ka": description_ka,
        "description_ru": description_ru,
        "email": email,
        "location": location_id,
        "category": category,
        "stack": stack
    }
    print("Vacancy Scraped Successfully")
    return data
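# Geonames(city) is called throughout these scrapers but never defined here. A
# sketch against the public GeoNames search API (assumes a registered GeoNames
# username; the real helper may resolve ids from a local table instead):
def Geonames(city):
    resp = requests.get(
        "http://api.geonames.org/searchJSON",
        params={"q": city, "maxRows": 1, "username": "demo"},  # placeholder username
        timeout=10,
    )
    results = resp.json().get("geonames", [])
    return str(results[0]["geonameId"]) if results else ""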
def Vacancy(link): print("request sent for Vacancy succesfully") url = link print(url) # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # Location try: location = Selector(response=page).xpath( '/html/body/main/section/div/div[1]/div[3]/ul/li[3]/a/text()').get( ) location = location.strip() location = location.split(",")[0] location = [{"city": location, "id": Geonames(location)}] except: location = [{"city": "Yerevan", "id": "616052"}] # Website try: website = Selector(response=page).xpath( '/html/body/main/section/div/div[1]/div[3]/ul/li[4]/a/@href').get( ) if website is None: website = [] else: website = [website] except: website = [] # Job Type try: job_type = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/ul/li[3]/text()').get() job_type = job_type.strip() except: job_type # Published try: published = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/ul/li[7]/text()').get() published = published.strip() except: published = "" # Salary try: salary = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/ul/li[2]/text()').get() salary = salary.strip() salary = salary.replace("֏", "") salary = salary.replace(",", "") salary = salary.replace(" ", "") salary = int(salary) except: salary = 0 # Gender try: gender = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/ul/li[4]/text()[2]').get() gender = gender.strip() except: gender = "" # Description try: description = Selector(response=page).xpath( '/html/body/main/section/div/div[2]/div/p').get() description = remove_tags(description).strip() except: description = "" try: if detect(description) == "et": try: description_en = Translate(description) except: description_en = "" description_am = description else: description_en = description description_am = "" except: description_en = "" description_am = "" # Email try: driver.get(link) email = driver.find_element_by_xpath( '/html/body/main/section/div/div[2]/div/p').text email = re.findall(r'[\w\.-]+@[\w\.-]+', email) except Exception as e: email = [] data = { "location": location, "website": website, "job_type": job_type, "publish_day": published, "salary": salary, "gender": gender, "description_am": description_am, "description_en": description_en, "email": email } # print(data) return data # Vacancy("https://www.worknet.am/en/job/%D5%A2%D5%A1%D5%B6%D5%BE%D5%B8%D6%80-%D5%BA%D5%A1%D5%B0%D5%A5%D5%BD%D5%BF%D5%AB-%D5%A1%D5%B7%D5%AD%D5%A1%D5%BF%D5%A1%D5%AF%D5%AB%D6%81-4656")
def Vacancy(link): print("request sent for Vacancy succesfully") url = link # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # Location try: location = Selector(response=page).xpath( '/html/body/div[2]/table/tr[contains(., "Location:")]').get() location = location.split("<td>")[1].split("</td>")[0].replace( "&nbsp", " ") location = location.split(",")[0] location = [{'city': location, 'id': Geonames(location)}] except: location = [{'city': 'Yerevan', 'id': '616052'}] # Company url try: c_url = Selector(response=page).xpath( '/html/body/div[2]/table/tr[contains(., "Company:")]').get() c_url = c_url.split('href="')[1].split('">')[0] except: c_url = "" # Vacancy Description try: description = Selector(response=page).xpath('/html/body/div[4]').get() description = remove_tags(description) description = description.strip() description = description.replace('&nbsp', " ") except: description = "" try: if detect(description) == "et": try: description_en = Translate(description) except: description_en = "" description_am = description else: description_en = description description_am = "" except: description_en = "" description_am = "" # Email try: email = Selector(response=page).xpath('//*[@id="job"]/a/@href').get() email = email.replace('mailto:', "") email = [email] except: email = [] data = { "location": location, "c_link": c_url, "description_am": description_am, "description_en": description_en, "email": email } # print(data) return data
def Vacancy_Info(link):
    url = link
    page = requests.get(url)

    # Industry
    try:
        industry = Selector(response=page).xpath(
            '/html/body/section/div/div/div/div[2]/div[1]/div/div/div[1]/div[contains(., "Industry:")]'
        ).get()
        industry = industry.split('"font-weight-bold">')[1].split('</span>')[0]
    except:
        industry = ""

    # Salary
    try:
        salary = Selector(response=page).xpath(
            '/html/body/section/div/div/div/div[2]/div[1]/div/div/div[1]/div[contains(., "Salary:")]'
        ).get()
        salary = salary.split('"font-weight-bold">')[1].split('</span>')[0]
        salary = salary.replace(",", "")
        salary = int(salary)
    except:
        salary = 0

    # Employment type
    try:
        employment_type = Selector(response=page).xpath(
            '/html/body/section/div/div/div/div[2]/div[1]/div/div/div[1]/div[contains(., "Employment type:")]'
        ).get()
        employment_type = employment_type.split(
            '"font-weight-bold">')[1].split('</span>')[0]
    except:
        employment_type = ""

    # Ends (dd/mm/yyyy)
    try:
        ends = Selector(response=page).xpath(
            '/html/body/section/div/div/div/div[2]/div[1]/div/div/div[1]/div[contains(., "Deadline:")]'
        ).get()
        ends = ends.split('"font-weight-bold">')[1].split('</span>')[0]
        ends = ends.split('/')
        deadline_day = int(ends[0])
        deadline_month = int(ends[1])
        deadline_year = int(ends[2])
    except:
        deadline_day = 0
        deadline_month = 0
        deadline_year = 0

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/section/div/div/div/div[2]/div[1]/div/div/div[2]').get()
        description = remove_tags(description)
        description = description.strip()
    except:
        description = ""
    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except:  # detect() raises on empty input
        description_en = ""
        description_am = ""

    # Email
    try:
        email = Selector(
            response=page).xpath('//*[@id="applyEmail"]/text()').get()
    except:
        email = []

    data = {
        "industry": industry,
        "salary": salary,
        "employment_type": employment_type,
        "deadline_day": deadline_day,
        "deadline_month": deadline_month,
        "deadline_year": deadline_year,
        "description_en": description_en,
        "description_am": description_am,
        "email": email
    }
    # print(data)
    return data
def Vacancy(link): print("request sent for Vacancy succesfully") url = link print(url) # headers = {"Accept-Language": "en-US,en;q=0.5"} page = requests.get(url) #headers=headers) # Company try: company = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/text()' ).get() except: company = "" # Website try: website = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lnkCompany"]/@href' ).get() website = [website] except: website = [] # Position try: position = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobPostTitle"]/text()' ).get() except: position = "" # logo try: logo = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_imgCompanyLogoLink"]/@src' ).get() logo = "http://jobfinder.am/" + logo except: logo = '' # Job_type try: job_type = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblPositionType"]/text()' ).get() except: job_type = "" # Category try: category = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblCategory"]/text()' ).get() except: category = "" # Experience try: experience = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblExperience"]/text()' ).get() except: experience = "" # Education try: education = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblEducation"]/text()' ).get() except: education = "" # Location try: location = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblLocation"]/text()' ).get() except: location = "" # Published try: published = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()' ).get() published = published.split(" ") published = published[0].split("-") publish_day = int(published[0]) publish_month = int(published[1]) publish_year = int("20" + published[2]) except: publish_day = 0 publish_month = 0 publish_year = 0 if yesterday_day != publish_day or yesterday_month != publish_month: print("Not published yesterday") return # Ends try: ends = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblDate"]/text()' ).get() ends = ends.split(" ") ends = ends[0].split("-") deadline_day = int(ends[0]) deadline_month = int(ends[1]) deadline_year = int("20" + ends[2]) except: deadline_day = 0 deadline_month = 0 deadline_year = 0 # Salary try: salary = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblSalary"]/text()' ).get() salary = int(salary) except: salary = 0 # Age try: age = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAge"]/text()' ).get() if "--------" in age: age = "" except: age = "" # Gender try: gender = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblGender"]/text()' ).get() if "--------" in gender: gender = "" except: gender = "" # Job Description try: j_description = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobDescription"]/text()' ).get() except: j_description = "" # Job Responsibilities try: j_responsibilities = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblJobResponsibilities"]/text()' ).get() except: j_responsibilities = "" # Required 
Qualifications try: r_qualifications = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblRequiredQualifications"]' ).get() r_qualifications = remove_tags(r_qualifications) except: r_qualifications = "" # Application Procedure try: a_procedure = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]' ).get() a_procedure = remove_tags(a_procedure) except: a_procedure = remove_tags(a_procedure) v_description = j_description + "\n" + j_responsibilities + "\n" + r_qualifications + "\n" + a_procedure try: if detect(v_description) == "et": try: v_description_en = Translate(v_description) except: v_description_en = "" v_description_am = v_description else: v_description_en = v_description v_description_am = "" except: v_description_en = "" v_description_am = "" # About Company try: c_description = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblAboutCompany"]' ).get() c_description = remove_tags(c_description) except: c_description = "" try: if detect(c_description) == "et": try: c_description_en = Translate(c_description) except: c_description_en = "" c_description_am = c_description else: c_description_en = c_description c_description_am = "" except: c_description_en = "" c_description_am = "" # Email try: email = Selector(response=page).xpath( '//*[@id="ctl00_bdyPlaceHolde_jfpanelViewJob_jfJobPreview_lblApplicationProcedure"]/a/text()' ).get() email = email.strip() email = [email] except: email = [] # Phone try: phone = re.search(r"\d{9}", v_description_en).group() phone = [{"country_code": "374", "number": phone}] except: phone = [] data = { "company": company, "position": position, "website": website, "logo": logo, "job_type": job_type, "category": category, "experience": experience, "education": education, "location": location, "publish_day": publish_day, "publish_month": publish_month, "publish_year": publish_year, "deadline_day": deadline_day, "deadline_month": deadline_month, "deadline_year": deadline_year, "salary": salary, "age": age, "gender": gender, "v_description_am": v_description_am, "v_description_en": v_description_en, "c_description_am": c_description_am, "c_description_en": c_description_en, "email": email, "phone": phone, } # print(data) return data # Vacancy('http://jobfinder.am/ViewJob.aspx?JobPostingID=49217')
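# yesterday_day and yesterday_month, used by the publish-date filter in
# Vacancy() above, are not defined in this file; a minimal sketch of how they
# would be computed at module level:
from datetime import date, timedelta

_yesterday = date.today() - timedelta(days=1)
yesterday_day = _yesterday.day
yesterday_month = _yesterday.month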
def Company_Info(link):
    url = link
    page = requests.get(url)

    # Address
    try:
        address = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div/div/div/div[3]/div/div[1]/p/span[2]/text()').get()
    except:
        address = ""

    # Phone
    try:
        phone = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div/div/div/div[3]/div/div[3]/p/a/text()').get()
        number = phone.replace("+", "")
        number = number.replace("374", "")
        number = number.replace("tel: ", "")
        phone = [{"country_code": "374", "number": number}]
    except:
        phone = []

    # Website (the site sometimes puts a phone number in the website slot)
    try:
        website = Selector(response=page).xpath(
            '/html/body/div[3]/div[1]/div/div/div/div[3]/div/div[2]/p/a/@href').get()
        if website is None:
            website = []
        elif "+" in website or "374" in website:
            phone = website
            number = phone.replace("+", "")
            number = number.replace("374", "")
            number = number.replace("tel: ", "")
            phone = [{"country_code": "374", "number": number}]
            website = []
        else:
            website = [website]
    except:
        website = []

    # Description
    try:
        description = Selector(response=page).xpath(
            '/html/body/div[3]/div[2]/div/div/div/div[2]/div/div/p/text()').get()
        if description is None:
            description = ""
    except:
        description = ""

    try:
        if detect(description) == "et":
            try:
                description_en = Translate(description)
            except:
                description_en = ""
            description_am = description
        else:
            description_en = description
            description_am = ""
    except:
        # detect() raises on empty text
        description_en = ""
        description_am = ""

    data = {
        "address": address,
        "phone": phone,
        "website": website,
        "description_am": description_am,
        "description_en": description_en,
    }
    # print(data)
    return data

# Company_Info('https://job.am/en/company/18390/san-holding-spe')
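# Company_Info() strips "+", "374", and "tel: " with chained replace() calls,
# which also mangles local numbers that happen to contain "374". A regex-based
# sketch that only strips a leading country code; normalize_am_phone is a
# hypothetical helper, not part of the original code.
import re

def normalize_am_phone(raw):
    digits = re.sub(r"\D", "", raw or "")  # keep digits only
    if digits.startswith("374"):           # strip the country code prefix
        digits = digits[3:]
    elif digits.startswith("0"):           # strip a trunk zero
        digits = digits[1:]
    return [{"country_code": "374", "number": digits}] if digits else []

# Usage: phone = normalize_am_phone("tel: +374 91 234567")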