def LoginGetValue(root):
    # Log in to the LME data site with a headless PhantomJS browser.
    from selenium import webdriver

    driver = webdriver.PhantomJS()
    driver.get("https://secure.lme.com/Data/Community/Login.aspx")
    driver.find_element_by_id('_logIn__userID').send_keys("username")
    driver.find_element_by_id('_logIn__password').send_keys("password")
    driver.find_element_by_id('_logIn__logIn').click()
    # Enter the daily stocks/prices page.
    driver.find_element_by_id('_subMenu__dailyStocksPricesMetals').click()
    date = driver.find_element_by_xpath(
        "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[6]/td[1]").text
    Copper = driver.find_element_by_xpath(
        "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[8]").text
    Aluminium = driver.find_element_by_xpath(
        "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[6]").text
    Nickel = driver.find_element_by_xpath(
        "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[12]").text
    Zinc = driver.find_element_by_xpath(
        "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[16]").text
    date1 = date.encode("utf-8")
    # Rearrange the slices of the scraped date cell into "YYYY-MM-DD".
    dateConvert = "%s-%s-%s" % (date1[11:], date1[8:10], date1[5:7])
    driver.quit()
    LMELogin = (dateConvert, Copper.encode('utf-8'), Aluminium.encode('utf-8'),
                Nickel.encode('utf-8'), Zinc.encode('utf-8'))
    return LMELogin
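The slice arithmetic above implies the scraped date cell has a five-character prefix followed by DD/MM/YYYY (for example "Date 06/03/2014"; this format is inferred from the slice offsets, not confirmed by the source). Under that assumption, a minimal sketch of the same conversion through datetime, which raises instead of silently mangling an unexpected layout:

from datetime import datetime

def convert_lme_date(raw):
    # Assumed cell layout: 5-character prefix + DD/MM/YYYY,
    # inferred from the slices [5:7], [8:10] and [11:] above.
    return datetime.strptime(raw[5:], "%d/%m/%Y").strftime("%Y-%m-%d")

print convert_lme_date("Date 06/03/2014")  # -> 2014-03-06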
def getNews(url):
    import requests
    from bs4 import BeautifulSoup
    from datetime import datetime
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "lxml")
        title = soup.title.string
        kinds = soup.find(attrs={"name": "section"})['content']
        date = soup.find("time").text.strip()
        # Normalise a Chinese date such as u"2016年01月02日 13:45"
        # into "2016-01-02 13:45".
        date = date.replace(u"年", "-")
        date = date.replace(u"月", "-")
        date = date.replace(u"日", "")
        if kinds != u"寵物動物":  # skip the pets/animals section
            # Pad to a full timestamp before parsing.
            if date.find(':') != -1:
                date += ":00"
            else:
                date += " 00:00:00"
            date = date.encode('utf-8')
            date = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")
            return (title, kinds, date)  # assumed return; the source snippet is truncated here
    except Exception:
        return None
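The source of getNews is truncated after the date parse; assuming the completed return of the (title, kinds, date) triple shown above, a usage sketch with a placeholder URL looks like this (articles in the u"寵物動物" section, and any fetch or parse failure, yield None):

item = getNews("http://example.com/some-article")  # placeholder URL
if item is not None:
    title, kinds, date = item
    print title, kinds, date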
def download_annonce_leboncoin(id):
    # Relies on module-level state: headers, photo_jobs, download_photo,
    # Appartement, Session and the poubelle counter.
    global poubelle  # running count of listings discarded as agency spam
    print "Downloading listing %d" % id
    appart_url = "http://www.leboncoin.fr/locations/%d.htm" % id
    request = urllib2.Request(appart_url, headers=headers)
    response = urllib2.urlopen(request)
    the_page = response.read()
    pool = BeautifulSoup(the_page)
    upload_by = pool.find("div", {"class": "upload_by"})
    auteur = upload_by.find("a").string
    # Skip listings posted by known agencies.
    if (auteur == " ancea " or auteur == " bonapart immobilier "
            or auteur == " allo location " or auteur == " casa immo "
            or "hestia" in auteur.lower()):
        poubelle += 1
        return
    date = unicode(upload_by.contents[2].string).strip()[:-1]
    titre = pool.find("div", {"class": "header_adview"}).find("h2").string
    try:
        # The listing date carries no year, so 2013 is hardcoded.
        date = datetime.fromtimestamp(
            time.mktime(time.strptime("2013 " + date.encode("utf-8"),
                                      u"%Y le %d %B à %H:%M".encode("utf-8"))))
    except AttributeError:
        date = datetime.now()
    params = pool.find("div", {"class": "lbcParams"})
    loyer = int(re.sub(r'[^\d-]+', '', params.find("span", {"class": "price"}).string[:-2]))
    ville = params.find("th", text=re.compile("Ville")).parent.td.string
    cp = int(re.sub(r'[^\d-]+', '', params.find("th", text=re.compile("Code postal")).parent.td.string))
    try:
        # The number of rooms is not a mandatory field.
        pieces_tag = params.find("th", text=re.compile(r"Pi.ces"))
        pieces = pieces_tag.parent.td.string if pieces_tag else None
        # Furnished/unfurnished is not a mandatory field.
        meuble_tag = params.find("th", text=re.compile(r"Meubl."))
        if meuble_tag:
            meuble = (unicode(meuble_tag.parent.td.string.strip()) == u"Meublé")
        else:
            meuble = None
        # The surface area is not a mandatory field.
        surface_tag = params.find("th", text=re.compile("Surface"))
        if surface_tag:
            surface = int(re.sub(r'[^\d-]+', '', surface_tag.parent.td.contents[0]))
        else:
            surface = None
    except AttributeError:
        print "Scraping problem"
    description = unicode(pool.find("div", {"class": "content"}))
    # Grab the photos from the carousel code (present when there are several photos).
    photos = re.findall(r"aImages\[\d\] = \"(http://.*)\";", the_page)
    if not photos:
        # With zero or one photo there is no carousel, so try another way.
        image_tag = pool.find("a", {"id": "image"})
        if image_tag:  # there is exactly one photo
            photos = re.findall(r"(http://.*\.jpg)", image_tag["style"])
    for photo in photos:
        photo_jobs.add(download_photo, (photo, appart_url))
    appart = Appartement(id, titre, loyer, ville, cp, pieces, meuble, surface,
                         description, photos, date, auteur, "leboncoin", appart_url)
    try:
        Session.add(appart)
        Session.commit()
    except IntegrityError:
        print "Got integrity error while trying to add %d %s" % (id, appart)
    time.sleep(1)
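Since the function depends on that module-level state, a driver loop is simple; a hedged sketch with purely illustrative listing ids:

for annonce_id in [500123456, 500123457]:  # illustrative ids only
    try:
        download_annonce_leboncoin(annonce_id)
    except Exception as e:
        # One broken listing should not stop the crawl.
        print "failed on %d: %s" % (annonce_id, e)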
def extract_champs_xml(filename):
    # Parse posts out of an XML dump; relies on the module-level helpers
    # specialutf8, cleancontent and detecthref, and on the verbose flag.
    posts = []
    print " - posts file: \"" + filename + "\"..."
    file = open(filename, "r")
    line = file.readline()
    newpost = 0
    while "</posts>" not in line:
        # Progress marker every 50 posts.
        if (not (len(posts)) % 50) and (len(posts) > 0):
            print " [#" + str(len(posts)) + "]"
        if newpost == 0:
            sline = line.split()
            if len(sline) > 0:
                if sline[0] == "<post>":
                    newpost = 1
                else:
                    line = file.readline()
        if newpost == 1:
            current = file.readline()
            while not "date" in current:
                current = file.readline()
            # Slice out YYYYMMDD and rewrite it as "YYYY-MM-DD ".
            date = current[8:16]
            date = date[:4] + '-' + date[4:6] + '-' + date[6:] + ' '
            file.readline()
            current = file.readline()
            categ1 = current[10:-8]
            current = file.readline()
            categ2 = current[10:-8]
            current = file.readline()
            categ3 = current[10:-8]
            file.readline()
            file.readline()
            file.readline()
            file.readline()
            current = file.readline()
            permalink = unicode(current[13:-13], 'utf-8')
            file.readline()
            current = file.readline()
            website = unicode(current[8:-8], 'utf-8')
            # Accumulate content lines until the <title> element is reached.
            finished = 0
            content = ''
            title = ''
            while finished == 0:
                contentline = file.readline()
                scontentline = contentline.split('>')
                if len(scontentline) > 0:
                    if scontentline[0] == "  <title":
                        finished = 1
                        u = unicode(contentline[9:-9], 'utf-8')
                        title = u.encode("latin-1")
                        content = title + ' . ' + content.encode('utf-8')
                if finished == 0:
                    content += unicode(contentline, 'utf-8')
            content = content.decode('utf-8', 'replace')
            content = (content.replace("\n", " ").replace("\r", " ")
                              .replace("<b>content: </b> ", "").replace(" <div>", ""))
            print "title " + title
            if verbose > 1:
                print " - content:", content
            contentclean = specialutf8(cleancontent(
                content.replace("</span>", " ").replace("</hr>", " ")
                       .replace("</li>", " ").replace("</a>", " ")
                       .replace("</br>", " ").replace("</div>", " ")
                       .replace("</p>", " ").replace("<hr />", " ")
                       .replace("</h1>", " ").replace("</h3>", " ")
                       .replace("</h4>", " ").replace("</h5>", " ")
                       .replace("</img>", " ")))
            if verbose > 1:
                print " - clean content:", contentclean
            contentanchor = detecthref(content)
            if verbose > 1:
                print " - anchor content:", contentanchor
            content = content.encode('utf-8', 'replace')
            contentclean = contentclean.encode('utf-8', 'replace')
            date = date.encode('utf-8', 'replace')
            permalink = permalink.encode('utf-8', 'replace')
            categ1 = categ1.encode('utf-8', 'replace')
            categ2 = categ2.encode('utf-8', 'replace')
            categ3 = categ3.encode('utf-8', 'replace')
            contentanchor = contentanchor.encode('utf-8', 'replace')
            # Store the post without the raw HTML.
            posts.append([title, date, permalink, website, categ1, categ2,
                          categ3, contentclean, contentanchor])
            newpost = 0
            line = current
        else:
            line = file.readline()
    file.close()
    print "---", len(posts), "posts processed."
    return posts
def extract_champs_html(filename):
    # Parse posts out of an HTML dump; relies on the module-level helpers
    # specialutf8, cleancontent and detecthref, on codecs and on the verbose flag.
    posts = []
    print " - posts file: \"" + filename + "\"..."
    file = codecs.open(filename, "r", "utf8")
    line = file.readline()
    newpost = 0
    while line != "":
        # Progress marker every 50 posts.
        if (not (len(posts)) % 50) and (len(posts) > 0):
            print " [#" + str(len(posts)) + "]"
        if newpost == 0:
            sline = line.split()
            if len(sline) > 0:
                if sline[0] == "<div><b>title:":
                    newpost = 1
        if newpost == 1:
            title = file.readline()[12:-14]
            if verbose > 0:
                print " - post title:", title
            current = file.readline()
            while not "date" in current:
                current = file.readline()
            date = file.readline()[12:-14]
            file.readline()
            file.readline()  # lf: IGNORED
            file.readline()
            file.readline()  # accuracy: IGNORED
            file.readline()
            permalink = file.readline()[12:-14]
            if verbose > 2:
                print " - permalink:", permalink
            file.readline()
            website = file.readline()[12:-14]
            if verbose > 2:
                print " - website:", website
            file.readline()
            author = file.readline()[12:-14]
            if verbose > 2:
                print " - author:", author
            file.readline()
            file.readline()  # query: IGNORED
            cur_ligne = file.readline()
            while not "categori" in cur_ligne:
                cur_ligne = file.readline()
            category = file.readline()[12:-14]
            if verbose > 2:
                print " - category:", category
            categs = category.split('>')
            # Now read the content itself, up to the next post marker or EOF.
            finished = 0
            content = ""
            while finished == 0:
                contentline = file.readline()
                if contentline == "":
                    finished = 1
                    line = ""
                scontentline = contentline.split()
                if len(scontentline) > 0:
                    if scontentline[0] == "<div><b>title:":
                        finished = 1
                if finished == 0:
                    content += contentline
            content = (content.replace("\n", " ").replace("\r", " ")
                              .replace("<b>content: </b> ", "").replace(" <div>", ""))
            categ1 = categs[0].replace(' ', '')
            categ2 = categs[1].replace(' ', '')
            categ3 = categs[2].replace(' ', '')
            if verbose > 1:
                print " - content:", content
            contentclean = specialutf8(cleancontent(
                content.replace("</span>", " ").replace("</hr>", " ")
                       .replace("</li>", " ").replace("</a>", " ")
                       .replace("</br>", " ").replace("</div>", " ")
                       .replace("</p>", " ").replace("<hr />", " ")
                       .replace("</h1>", " ").replace("</h3>", " ")
                       .replace("</h4>", " ").replace("</h5>", " ")
                       .replace("</img>", " ")))
            title = specialutf8(title)
            if verbose > 1:
                print " - clean content:", contentclean
            contentanchor = detecthref(content)
            if verbose > 1:
                print " - anchor content:", contentanchor
            title = title.encode('utf-8', 'replace')
            content = content.encode('utf-8', 'replace')
            contentclean = contentclean.encode('utf-8', 'replace')
            date = date.encode('utf-8', 'replace')
            permalink = permalink.encode('utf-8', 'replace')
            categ1 = categ1.encode('utf-8', 'replace')
            categ2 = categ2.encode('utf-8', 'replace')
            categ3 = categ3.encode('utf-8', 'replace')
            contentanchor = contentanchor.encode('utf-8', 'replace')
            # Store the post without the raw HTML.
            posts.append([title, date, permalink, website, categ1, categ2,
                          categ3, contentclean, contentanchor])
            newpost = 0
        else:
            line = file.readline()
    file.close()
    print "---", len(posts), "posts processed."
    return posts
def extract_champs_txt(filename):
    # Parse posts out of a plain-text dump; relies on the module-level helpers
    # cleancontent and detecthref, on codecs and on the verbose flag.
    posts = []
    print " - post file \"" + filename + "\"..."
    file = codecs.open(filename, "r", "utf8")
    line = " "
    newpost = 0
    while line != '':
        # Progress marker every 50 posts.
        if not (len(posts)) % 50:
            print " [#" + str(len(posts)) + "]"
        if newpost == 0:
            line = file.readline()
            sline = line.split()
            if len(sline) > 0:
                if sline[0] == "title:":
                    newpost = 1
        if newpost == 1:
            title = line[7:-1]
            if verbose > 0:
                print " - post title:", title
            cur_ligne = ''
            while not "date" in cur_ligne:
                cur_ligne = file.readline()
            date = cur_ligne[6:-1]
            if verbose > 0:
                print " - date:", date
            file.readline()  # lf: IGNORED
            file.readline()  # accuracy: IGNORED
            permalink = file.readline()[11:-1]
            if verbose > 2:
                print " - permalink:", permalink
            website = file.readline()[9:-1]
            if verbose > 2:
                print " - website:", website
            author = file.readline()[8:-1]
            if verbose > 2:
                print " - author:", author
            file.readline()  # query: IGNORED
            cur_ligne = ''
            while not "categorie" in cur_ligne:
                cur_ligne = file.readline()
            category = cur_ligne[12:]
            if verbose > 2:
                print " - category:", category
            categs = category.split('>')
            # Now read the content itself, up to the separator or the next title.
            finished = 0
            content = ""
            while finished == 0:
                contentline = file.readline()
                if contentline == "":
                    finished = 1  # EOF guard
                if "========" in contentline:
                    finished = 1
                scontentline = contentline.split()
                if len(scontentline) > 0:
                    if scontentline[0] == "title:":
                        finished = 1
                if finished == 0:
                    content += contentline
            content = content.replace("\n", " ").replace("\r", " ").replace("content:", "")
            categ1 = categs[0].replace(' ', '')
            categ2 = categs[1].replace(' ', '')
            categ3 = categs[2].replace(' ', '')
            if verbose > 1:
                print " - content:", content
            contentclean = cleancontent(
                content.replace("</span>", " ").replace("</hr>", " ")
                       .replace("</li>", " ").replace("</a>", " ")
                       .replace("</br>", " ").replace("</div>", " ")
                       .replace("</p>", " ").replace("<hr />", " ")
                       .replace("</h1>", " ").replace("</h3>", " ")
                       .replace("</h4>", " ").replace("</h5>", " ")
                       .replace("</img>", " "))
            if verbose > 1:
                print " - clean content:", contentclean
            contentanchor = detecthref(content)
            if verbose > 1:
                print " - anchor content:", contentanchor
            contentclean = contentclean.encode('utf-8', 'replace')
            title = title.encode('utf-8', 'replace')
            content = content.encode('utf-8', 'replace')
            date = date.encode('utf-8', 'replace')
            permalink = permalink.encode('utf-8', 'replace')
            categ1 = categ1.encode('utf-8', 'replace')
            categ2 = categ2.encode('utf-8', 'replace')
            categ3 = categ3.encode('utf-8', 'replace')
            contentanchor = contentanchor.encode('utf-8', 'replace')
            # Store the post without the raw HTML.
            posts.append([title, date, permalink, website, categ1, categ2,
                          categ3, contentclean, contentanchor])
            newpost = 0
    file.close()
    print "---", len(posts), "posts processed."
    return posts
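All three extract_champs_* variants append records with the same field order, so downstream code can unpack them uniformly; a small consumption sketch (the filename is a placeholder):

posts = extract_champs_txt("dump.txt")  # hypothetical input file
for post in posts:
    (title, date, permalink, website,
     categ1, categ2, categ3, contentclean, contentanchor) = post
    print date, permalink, title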
def is_date_less_last_date(date, days_to_last_date):
    return (datetime.strptime(date.encode('utf-8'), '%d %b %y').date()
            < get_last_date_to_parse(days_to_last_date))
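A usage sketch, assuming get_last_date_to_parse(n) returns the cutoff date n days back (its implementation is not shown here):

# '%d %b %y' parses dates such as u'3 Jan 17'.
if is_date_less_last_date(u'3 Jan 17', days_to_last_date=7):
    print "date is past the parsing window, stop here"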
soup3 = BeautifulSoup(res3.text)
da = soup3.find('table', {"class": "subpage_data spFirst"})
da2 = da.findAll('tr')
date = 'null'
# Prefer the full USA release date over the limited one.
for i in range(0, len(da2)):
    da3 = da2[i].findAll('td')
    if da3[0].text == 'USA' and da3[2].text == '':
        date = da3[1].text
        break
if date == 'null':
    for i in range(0, len(da2)):
        da3 = da2[i].findAll('td')
        if da3[0].text == 'USA' and da3[2].text == ' (limited)':
            date = da3[1].text
            break
data = date.encode('utf-8')

# ------------------- price
res2 = requests.get(line + 'business?ref_=tt_dt_bus')
soup2 = BeautifulSoup(res2.text)
pr = soup2.find('div', {"id": "tn15content"})
pr2 = pr.text.replace('\n', ' ').split(' ')
cr = 0
j = 0
price = []
pdate = []
punit = []
cc = pr.text.split('\n')
for i in range(0, len(cc)):
    if cc[i] == 'Weekend Gross':
        cr = 1
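The two scans over da2 differ only in the qualifier column, so they could be folded into one helper; a sketch that preserves the wide-then-limited preference and also guards against short rows:

def find_usa_release(rows, qualifier):
    # qualifier is '' for a wide release, ' (limited)' for a limited one.
    for row in rows:
        tds = row.findAll('td')
        if len(tds) > 2 and tds[0].text == 'USA' and tds[2].text == qualifier:
            return tds[1].text
    return None

date = find_usa_release(da2, '') or find_usa_release(da2, ' (limited)') or 'null'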
# LME price
driver.get("https://secure.lme.com/Data/Community/Login.aspx")
driver.find_element_by_id('_logIn__userID').send_keys("USERNAME")
driver.find_element_by_id('_logIn__password').send_keys("PSWORD")
driver.find_element_by_id('_logIn__logIn').click()
# Enter the page
driver.find_element_by_id('_subMenu__dailyStocksPricesMetals').click()
date = driver.find_element_by_xpath(
    "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[6]/td[1]").text
Copper = driver.find_element_by_xpath(
    "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[8]").text
Aluminium = driver.find_element_by_xpath(
    "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[6]").text
Nickel = driver.find_element_by_xpath(
    "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[12]").text
Zinc = driver.find_element_by_xpath(
    "//*[@id='Table3']/tbody/tr[5]/td/table/tbody/tr[7]/td[16]").text
date1 = date.encode("utf-8")
dateConvert = "%s-%s-%s" % (date1[11:], date1[8:10], date1[5:7])
driver.quit()
LMEArr = [Copper.encode('utf-8'), Aluminium.encode('utf-8'),
          Nickel.encode('utf-8'), Zinc.encode('utf-8')]

# Currency
CResultArr = []
user = '******'
password = "******"
url = "http://www.ctci.com.tw/Acc_Rep/rate/rate.asp"
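The LME cells arrive as text; if they carry thousands separators (e.g. "6,550.00", an assumption about the page, not confirmed here), turning LMEArr into numbers could look like:

# Strip assumed comma separators before float().
lme_prices = [float(p.replace(',', '')) for p in LMEArr]
print dict(zip(["Copper", "Aluminium", "Nickel", "Zinc"], lme_prices))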