import re
import sqlite3

from bs4 import BeautifulSoup

from module.spider_log import spider_log
# current_time and ctrip_calendar_insert_db are project-local helpers whose
# import lines are not shown in this excerpt.


def qunar_calendar_parse_data(air_port, file_path):
    p_dict = {}
    # Read the saved page
    with open(file_path, 'r', encoding='utf-8') as f:
        page = f.read()
    # Parse the page
    soup = BeautifulSoup(page, 'lxml')
    try:
        price_list = soup.find_all('span', class_="price")
        # The first four entries are August data (rendered as "查看" in the
        # file); only the 30 September entries are parsed.
        for idx in range(4, 34):
            tmp = BeautifulSoup(str(price_list[idx]), 'lxml')
            date = idx - 3
            p_dict[str(date)] = tmp.body.span.span.text
        qunar_calendar_insert_db(air_port, p_dict)
    except Exception:
        spider_log("price_list not found.")
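
# A minimal, self-contained sketch of the markup qunar_calendar_parse_data
# assumes (the HTML fragment below is illustrative, not captured from the
# live site): each calendar cell is a <span class="price"> whose first
# nested <span> holds the fare, which is why the parser reads
# tmp.body.span.span.text.
def _demo_qunar_price_cell():
    sample = '<span class="price"><span>3200</span></span>'
    cell = BeautifulSoup(sample, 'lxml')
    # lxml wraps the fragment in <html><body>...</body></html>, so
    # body.span is the outer cell and body.span.span the fare span
    assert cell.body.span.span.text == '3200'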
def qunar_calendar_insert_db(air_port, Sep_p_dict):
    db_name = "qunar_calendar_Seq_ticket.db"
    db_path = "E:/ticket_spider/database/" + db_name
    conn = sqlite3.connect(db_path)
    cu = conn.cursor()
    # Sanitize the route string so it is a legal table name
    table_name = re.sub(r"=", '_', air_port)
    try:
        # One INTEGER column per September day: Sep01 .. Sep30
        columns = ", ".join("Sep%02d INTEGER" % d for d in range(1, 31))
        CREATE_DB = ("CREATE TABLE " + table_name
                     + "(crawl_time CHAR(50), " + columns + ")")
        cu.execute(CREATE_DB)
        spider_log(air_port + ": db created, table name: " + table_name)
    except Exception:
        # Table already exists
        # spider_log(air_port + ": db exists.")
        pass
    crawl_time = current_time()
    try:
        # Parameterized insert: one value per day plus the crawl timestamp
        column_names = ", ".join("Sep%02d" % d for d in range(1, 31))
        placeholders = ", ".join(["?"] * 31)
        INSERT_DB = ("INSERT INTO " + table_name + "(crawl_time, "
                     + column_names + ") VALUES(" + placeholders + ")")
        values = [crawl_time] + [Sep_p_dict[str(d)] for d in range(1, 31)]
        cu.execute(INSERT_DB, values)
        spider_log(air_port + ": data inserted.")
    except Exception:
        spider_log(air_port + ": data insert error.")
    cu.close()
    conn.commit()
    conn.close()
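
# A small sketch for checking what was written, assuming the same database
# path and table naming used above (the function name is illustrative, not
# part of the project). Note sqlite3 cannot parameterize table names, so
# table_name must already be the sanitized identifier produced above.
def _dump_qunar_table(table_name):
    conn = sqlite3.connect(
        "E:/ticket_spider/database/qunar_calendar_Seq_ticket.db")
    cu = conn.cursor()
    for row in cu.execute("SELECT * FROM " + table_name):
        print(row)
    cu.close()
    conn.close()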
def ctrip_calendar_parse_data(air_port, file_path):
    p_dict = {}
    # Read the saved page
    with open(file_path, 'r', encoding='utf-8') as f:
        page = f.read()
    # Parse the page
    soup = BeautifulSoup(page, 'lxml')
    try:
        price_list = soup.table.tbody.find_all(class_="price")
        for idx in range(len(price_list)):
            ticket_price = re.findall(r"</dfn>(.+?)</div>",
                                      str(price_list[idx]))
            date = idx + 1
            p_dict[str(date)] = ticket_price[0]
        ctrip_calendar_insert_db(air_port, p_dict)
    except Exception:
        spider_log("price_list not found.")
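
# A minimal sketch of the regex extraction above, assuming a Ctrip price
# cell of the era looked roughly like the fragment below (illustrative
# only): the fare sits between the closing </dfn> of the currency sign and
# the closing </div> of the cell.
def _demo_ctrip_price_cell():
    sample = '<div class="price"><dfn>&yen;</dfn>3200</div>'
    assert re.findall(r"</dfn>(.+?)</div>", sample) == ['3200']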
import re
import time

from selenium import webdriver

from module.spider_log import spider_log
# current_time and the *_parse_data helpers are project-local; their import
# lines are not shown in this excerpt.


def qunar_calendar_spider(air_port_list):
    spider_log("qunar_spider start, webdriver: Firefox")
    driver = webdriver.Firefox()
    time.sleep(1)
    main_url = "https://flight.qunar.com/site/oneway_list_inter.htm"
    depart_time = "searchDepartureTime=2017-09-01"
    passenger_info = "adultNum=1&childNum=0"
    for air_port_item in air_port_list:
        spider_log("crawl air_line: " + air_port_item)
        # Query-string parameters are joined with "&" (the original joined
        # them all with "?", which the server cannot parse)
        search_url = (main_url + "?" + air_port_item + "&" + depart_time
                      + "&" + passenger_info)
        spider_log("crawl url: " + search_url)
        driver.get(search_url)
        # Click the low-price calendar to load the current month's (August) prices
        driver.find_element_by_xpath(".//*[@id='dateBar']/div[2]/div").click()
        time.sleep(1)
        # The original plan was to fetch 90 days; to keep the problem
        # simple, only September is fetched.
        month_page = driver.page_source
        # Write the page to a file
        crawl_time = current_time('file_name_hour')
        # name structure: site + time_stamp + line_info
        file_name = "qunar_" + crawl_time + "_" + air_port_item + ".txt"
        file_path = "E:/ticket_spider/raw_data/" + file_name
        # The encoding must be given when the file is opened
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(month_page)
        spider_log(air_port_item + ": downloaded, file_path: " + file_path)
        time.sleep(1)
        # Reduce the port info to a readable form (easier for naming the
        # database table)
        location = re.findall(r'[\u4e00-\u9fa5]+', air_port_item)
        air_port = location[0] + "_" + location[1]
        qunar_calendar_parse_data(air_port, file_path)
        time.sleep(1)
    driver.quit()
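
# A quick sketch of the URL the loop above builds for one route (the route
# value is taken from qunar_air_port_list in start_spider; the function is
# illustrative only):
def _demo_qunar_search_url():
    main_url = "https://flight.qunar.com/site/oneway_list_inter.htm"
    route = 'searchDepartureAirport=成都&searchArrivalAirport=洛杉矶'
    url = (main_url + "?" + route
           + "&searchDepartureTime=2017-09-01&adultNum=1&childNum=0")
    print(url)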
def ctrip_calendar_spider(air_port_list):
    spider_log("ctrip_spider start, webdriver: Firefox")
    driver = webdriver.Firefox()
    time.sleep(1)
    main_url = "http://flights.ctrip.com"
    air_line = "international"
    depart_time = "2017-09-01"
    position = "y_s"
    for air_port_item in air_port_list:
        spider_log("crawl air_line: " + air_port_item)
        search_url = (main_url + "/" + air_line + "/" + air_port_item
                      + "?" + depart_time + "&" + position)
        spider_log("crawl url: " + search_url)
        driver.get(search_url)
        time.sleep(1)
        # Click the low-price calendar to load the current month's (September) prices
        driver.find_element_by_xpath(
            ".//*[@id='calendar_tab']/div[4]/a").click()
        time.sleep(1)
        month_page = driver.page_source
        # Write the page to a file for parsing; the file is only an
        # intermediate step and could be skipped.
        crawl_time = current_time('file_name_hour')
        # name structure: site + time_stamp + line_info
        file_name = "ctrip_" + crawl_time + "_" + air_port_item + ".txt"
        file_path = "E:/ticket_spider/raw_data/" + file_name
        # The encoding must be given when the file is opened
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(month_page)
        spider_log(air_port_item + ": downloaded, file_path: " + file_path)
        time.sleep(1)
        ctrip_calendar_parse_data(air_port_item, file_path)
        time.sleep(1)
    driver.quit()
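
# Both spiders open a visible Firefox window; on a server, a headless
# Firefox is one option. A sketch, assuming a Selenium/geckodriver
# combination that supports it (older Selenium 3 releases took the
# firefox_options keyword instead of options):
def _make_headless_firefox():
    from selenium.webdriver.firefox.options import Options
    opts = Options()
    opts.add_argument('-headless')  # run Firefox without a window
    return webdriver.Firefox(options=opts)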
from module.send_email import send_log_email
from module.spider_log import spider_log
from spider.start_spider import start_spider
'''
function: Local-run entry point. Runs once an hour, crawling 10 routes per
run, 5 runs a day, for one week. Crawling starts at 1 p.m. and continues
until 5 p.m.
'''
if __name__ == '__main__':
    spider_log("ticket spider local mode start.")
    try:
        start_spider()
        spider_log("---")
        email_content = "spider run success, ctrip price calendar log."
    except Exception:
        email_content = "spider run error."
    log_path = "E:/ticket_spider/log/ticket_calendar_log.txt"
    #send_log_email(email_content, log_path)
def start_spider():
    ''' --- module split --- '''
    # Add the routes to crawl to this list
    ctrip_air_port_list = [
        'chengdu-losangeles-ctu-lax',  # Los Angeles
        'chengdu-newyork-ctu-nyc',     # New York
        'chengdu-tokyo-ctu-tyo',       # Tokyo
        'chengdu-seoul-ctu-sel',       # Seoul
        'chengdu-london-ctu-lon',      # London
        'chengdu-sydney-ctu-syd',      # Sydney
        'chengdu-paris-ctu-par',       # Paris
        'chengdu-moscow-ctu-mow'       # Moscow
    ]
    spider_log("ctrip_spider, crawl air_line:")
    spider_log(str(ctrip_air_port_list))
    ctrip_calendar_spider(ctrip_air_port_list)
    spider_log("ctrip_spider stop.")
    time.sleep(1)
    ''' --- module split --- '''
    # Qunar does not work well with PhantomJS, and Firefox runs unstably
    # against it, so Qunar data collection is not included for now.
    # Add the routes to crawl to this list
    qunar_air_port_list = [
        'searchDepartureAirport=成都&searchArrivalAirport=洛杉矶',
        'searchDepartureAirport=成都&searchArrivalAirport=纽约',
        'searchDepartureAirport=成都&searchArrivalAirport=东京',
        'searchDepartureAirport=成都&searchArrivalAirport=首尔',
        'searchDepartureAirport=成都&searchArrivalAirport=伦敦',
        'searchDepartureAirport=成都&searchArrivalAirport=悉尼',
        'searchDepartureAirport=成都&searchArrivalAirport=巴黎',
        'searchDepartureAirport=成都&searchArrivalAirport=莫斯科'
    ]
    spider_log("qunar_spider, crawl air_line:")
    spider_log(str(qunar_air_port_list))
    qunar_calendar_spider(qunar_air_port_list)
    spider_log("qunar_spider stop.")
    time.sleep(1)
    ''' --- module split --- '''
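
# The Qunar entries above double as query strings; qunar_calendar_spider
# later reduces them to a "departure_arrival" form for table naming. A
# small sketch of that reduction, using the same regex as the spider (the
# function name is illustrative only):
def _route_to_table_name(air_port_item):
    import re
    location = re.findall(r'[\u4e00-\u9fa5]+', air_port_item)
    return location[0] + "_" + location[1]
# _route_to_table_name('searchDepartureAirport=成都&searchArrivalAirport=洛杉矶')
# -> '成都_洛杉矶'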
import time

from module.send_email import send_log_email
from module.spider_log import spider_log
from spider.start_spider import start_spider
'''
function: Server-run entry point. Runs once every 2 hours, crawling 10
routes per run, 10 runs a day, for one week.
'''
if __name__ == '__main__':
    spider_log("ticket spider server mode start.")
    count = 1
    # First run
    spider_log("running count: " + str(count))
    start_spider()
    spider_log("---")
    while True:
        # Run once every 2 hours; time.sleep(5) is a short test interval
        # (production would use time.sleep(2 * 60 * 60))
        time.sleep(5)
        start_spider()
        spider_log("---")
        count += 1
        spider_log("running count: " + str(count))
        if count == 3:  # stop after 3 runs
            break
    email_content = "ctrip price calendar data."
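
# A sketch of the intended cadence from the docstring (10 runs a day for a
# week, i.e. 70 runs at 2-hour intervals); the function and its defaults
# are illustrative, not part of the project:
def _run_on_schedule(runs=70, interval_seconds=2 * 60 * 60):
    for count in range(1, runs + 1):
        spider_log("running count: " + str(count))
        start_spider()
        spider_log("---")
        if count < runs:
            time.sleep(interval_seconds)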
import os
import smtplib
import time
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

from module.spider_log import spider_log


def send_log_email(email_content=' ', log_path='', send_to_addr='*****@*****.**'):
    '''
    name: email-sending module
    func: Send an email to the given address. A default body, default
    recipient and default attachment are set; receiving log info is not
    considered for now (local debugging).
    send_log_email(email_content='', log_path='', send_to_addr='')
    '''
    # email address and password
    from_addr = "*****@*****.**"
    password = "******"
    # Recipient address; a copy also goes to this default address
    to_addr = send_to_addr
    # Build a mail container that supports attachments
    msg = MIMEMultipart()
    # Build a timestamp and add it to the subject
    t = time.localtime()
    time_stamp = str(t.tm_mon) + "." + str(t.tm_mday) + " " + str(
        t.tm_hour) + ":" + str(t.tm_min) + ":" + str(t.tm_sec)
    subject = '[' + time_stamp + '] ' + 'Ticket Crawler Run Report'
    # Fill in the mail headers
    msg["Subject"] = subject
    msg["From"] = from_addr
    msg["To"] = to_addr
    # Fill in the mail body
    # Default body text
    header_text = '''
    This mail is generated and sent automatically by ticket crawler.
    Crawl results for this run:
    '''
    # email_content should give: spider name, run time, number of items crawled
    mime_text = MIMEText(header_text + email_content, 'plain', 'utf-8')
    msg.attach(mime_text)
    # Attach the log file
    with open(log_path, 'rb') as f:
        # Set the attachment's MIME type and file name
        file_name = os.path.basename(log_path)
        mime = MIMEBase('text', 'plain', filename=file_name)
        # Add the required headers
        mime.add_header('Content-Disposition', 'attachment',
                        filename=file_name)
        mime.add_header('Content-ID', '<0>')
        mime.add_header('X-Attachment-Id', '0')
        # Read the attachment content
        mime.set_payload(f.read())
        # Base64-encode it
        encoders.encode_base64(mime)
        # Attach it to the MIMEMultipart
        msg.attach(mime)
    # Send the mail
    try:
        server = smtplib.SMTP_SSL("smtp.qq.com", 465)
        # server.set_debuglevel(1)  # print all SMTP traffic
        server.login(from_addr, password)
        server.sendmail(from_addr, to_addr, msg.as_string())
        server.quit()
        spider_log("mail sent to: " + to_addr)
    except Exception:
        spider_log("mail send failed.")
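
# A minimal usage sketch, mirroring the (commented-out) call in the local
# entry script; run this module directly for a smoke test:
if __name__ == '__main__':
    send_log_email("spider run success, ctrip price calendar log.",
                   "E:/ticket_spider/log/ticket_calendar_log.txt")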