def crawl_data(self, url):
    """Scrape one Douban movie page into a movie() record and persist it.

    The movie id is taken from the second-to-last URL path segment.
    The request is retried every 10 s until the page actually contains
    a title element (an empty title match means the page was not really
    served, e.g. rate limiting).  Prints a success line only when every
    generated SQL statement executed.
    """
    r = requests.get(url, headers=headers)
    m = movie()
    self.current_mid = m.id = url.split('/')[-2]

    # Title (doubles as the "page really loaded" probe).
    # All patterns below are raw strings: the originals wrote '\d' and
    # '\(' inside plain strings, which modern Python flags as invalid
    # escape sequences.
    pattern = re.compile(r'<span property="v:itemreviewed">(.*)</span>')
    m.title = re.findall(pattern, r.text)
    while m.title == []:
        print('Can NOT Reach {}, Sending Request Again'.format(url))
        time.sleep(10)
        r = requests.get(url, headers=headers)
        m.title = re.findall(pattern, r.text)
    m.title = m.title[0]

    # Year
    pattern = re.compile(r'<span class="year">\((\d*)\)</span>')
    m.year = re.findall(pattern, r.text)[0]
    # Director
    pattern = re.compile(r'rel="v:directedBy">([^<]*)')
    m.director = re.findall(pattern, r.text)
    # Writer
    pattern = re.compile(r'<a href="/celebrity/\d*/">([^<]*)</a>')
    m.writer = re.findall(pattern, r.text)
    # Starring
    pattern = re.compile(r'rel="v:starring">([^<]*)</a>')
    m.starring = re.findall(pattern, r.text)
    # Genre
    pattern = re.compile(r'<span property="v:genre">([^<]*)</span>')
    m.genre = re.findall(pattern, r.text)
    # Country / region
    pattern = re.compile(r'制片国家/地区:</span>([^<]*)')
    m.country = re.findall(pattern, r.text)
    # Language
    pattern = re.compile(r'语言:</span>([^<]*)')
    m.language = re.findall(pattern, r.text)
    # Runtime -- two pages have broken markup the regex cannot parse,
    # so their lengths are hard-coded.
    if m.id == '3734350':
        m.length = 6
    elif m.id == '6146955':
        m.length = 81
    else:
        pattern = re.compile(r'property="v:runtime" content="(\d+)')
        m.length = re.findall(pattern, r.text)[0]
    # Rating
    pattern = re.compile(r'property="v:average">([\d\.]+)</strong>')
    m.rating = re.findall(pattern, r.text)[0]

    # Persist; report success only if every statement succeeded.
    SQL = m.generate_sql()
    flag = True
    for sql in SQL:
        if self.execute_sql(sql) == False:
            flag = False
    if flag:
        print('Process-{} Successfully Saved {}'.format(self.id, m.title))
def getMovieByname(self, name):
    """Look up a movie row by name and hydrate a movie object.

    Returns the first matching movie (fetchone), or None when no row
    matches.
    """
    self.c.execute('SELECT * FROM posts WHERE name=?', (name, ))
    rsp = self.c.fetchone()
    if rsp:
        # Pass LikedCount too, matching getMovieByID -- the original
        # silently dropped it for name lookups.
        mv = movie(rsp['author'], rsp['poster'], rsp['review'],
                   rsp['tlink'], rsp['trailer'], rsp['dd_link'],
                   rsp['name'], self.getGenre(rsp),
                   LikedCount=rsp['LikedCount'],
                   idx=rsp['idx'], time=rsp['time'])
        return mv
    else:
        return None
def parseFragileDesires(url):
    """Scrape a FragileDesires listing page and register each video.

    For every thumbnail image on the page, builds a movie record,
    fetches the per-video page for its title, and issues the add-movie
    request.  Returns the list of movie objects processed -- the
    original created this list but never populated or returned it.
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    movies = []
    thumbs = tree.xpath(
        '//*[@class="attachment-home-image size-home-image wp-post-image"]')
    for thumb in thumbs:
        vid = movie()
        # URL of the main picture.
        main_pic_url = thumb.get("src")
        vid.mainImage = main_pic_url
        vid.site = "http://www.FragileDesires.com"
        # The parent's onclick handler holds the per-video URL as its
        # first single-quoted argument.
        video_page_url = thumb.getparent().get("onclick").split("'")[1]
        if ("member" in video_page_url) or ("archives" in video_page_url):
            # Site-specific id: image filename minus its 4-char extension.
            vid.movieID = main_pic_url.split("/")[-1][:-4]
            video_page = requests.get(video_page_url)
            video_tree = html.fromstring(video_page.content)
            # Drop the fixed 6-character prefix of the title element.
            title = video_tree.xpath('//*[@class="single_title"]')[0].text[6:]
            vid.title = title
            # Nana is the site's only performer.
            vid.actors.append("Nana")
            print(vid.createAddMovieUrl())
            requests.get(vid.createAddMovieUrl())
            movies.append(vid)
        else:
            # Not a real video -- e.g. the first item is a diary entry.
            pass
    return movies
def getMovieByID(self, idx):
    """Fetch the movie stored under primary key *idx*.

    Returns a hydrated movie object, or None when no row matches.
    """
    self.c.execute('SELECT * FROM posts WHERE idx=?', (idx, ))
    row = self.c.fetchone()
    if not row:
        return None
    return movie(row['author'], row['poster'], row['review'],
                 row['tlink'], row['trailer'], row['dd_link'],
                 row['name'], self.getGenre(row),
                 LikedCount=row['LikedCount'],
                 idx=row['idx'], time=row['time'])
def parseHardtiedPage(url):
    """Scrape one Hardtied listing page into a list of movie objects.

    For each article on the listing page, follows the per-movie link
    and extracts title, actors, date, summary, tags, poster and image
    URLs.  Returns the list of populated movie objects.
    """
    page = requests.get(url)
    tree = html.fromstring(page.content)
    movies = []
    videos = tree.xpath('//table[@class="articleWrapper"]')
    for eachVideo in videos:
        vid = movie()
        siteURL = "http://www.hardtied.com"
        # Per-movie page URL.
        postUrl = eachVideo.xpath(
            '*//span[@class="articleTitleText"]/a')[0].get("href")
        uniqueVidUrl = siteURL + postUrl
        uniquePage = requests.get(uniqueVidUrl)
        uniquePageTree = html.fromstring(uniquePage.content)
        mainPartOfPage = uniquePageTree.xpath(
            '//table[@class="articleWrapper"]')[0]
        # Title and actress(es) share one span, separated by '|'.
        for x in mainPartOfPage.xpath('*//span[@class="articleTitleText"]'):
            innerTextSplit = x.text_content().split("|")
            vid.title = innerTextSplit[0].strip()
            for eachActor in innerTextSplit[1:]:
                vid.actors.append(eachActor.strip())
        # Date of the video.
        uglyDate = mainPartOfPage.xpath(
            '*//span[@class="articlePostDateText"]')[0].text_content()
        vid.date = parse(uglyDate)
        # Summary (last copy-text element wins, as in the original).
        for x in mainPartOfPage.xpath('*//*[@class="articleCopyText"]'):
            vid.summary = x.text_content()
        # Tags are optional.  Narrowed from a bare except so that only
        # the expected failures are swallowed: no tag element on the
        # page (IndexError) or an empty element whose .text is None
        # (TypeError on slicing).
        try:
            tagBlock = uniquePageTree.xpath(
                '//*[@class="articleTagsText"]')[0].text
            # Drop the fixed 5-char prefix, strip layout whitespace.
            tagText = tagBlock[5:].replace("\t", "").replace("\n", "")
            vid.tags = [tag.strip() for tag in tagText.split(", ")]
        except (IndexError, TypeError):
            pass
        # All images: the poster is referenced directly by src; other
        # thumbnails link to their full-size version via the parent <a>.
        allPhotos = mainPartOfPage.xpath('*//img')
        for x in allPhotos:
            url = x.get("src")
            if "poster.jpg" in url:
                vid.mainImage = url
            else:
                fullSizeImage = x.getparent().get("href")
                if "images" in fullSizeImage:
                    vid.images.append(fullSizeImage)
        vid.site = siteURL
        movies.append(vid)
    return movies
def get_data(self, url):
    """Scrape one Douban movie page and append its fields to the
    per-column result lists on self.

    The numeric movie id is parsed from the fixed URL layout
    (characters 33..-1, i.e. https://movie.douban.com/subject/<id>/ --
    TODO confirm this holds for all inputs).  The page is re-fetched
    every 10 s until it actually contains a title element.
    """
    header = get_header()
    try:
        r = requests.get(url, headers=header)
    except requests.RequestException:
        # One best-effort retry with a fresh header.  Narrowed from a
        # bare except, which also swallowed KeyboardInterrupt.
        print("Connection Refused. We Are Trying Again.")
        time.sleep(10)
        header = get_header()
        r = requests.get(url, headers=header)
    context = r.text
    m = movie()
    self.current_movie_id = m.id = int(url[33:-1])

    # Title -- an empty match means the page was not actually served.
    # All patterns below are raw strings: the originals wrote '\d' and
    # '\(' inside plain strings, which modern Python flags as invalid
    # escape sequences.
    pattern = re.compile(r'<span property="v:itemreviewed">(.*)</span>')
    m.title = re.findall(pattern, context)
    while m.title == []:
        print("Can NOT Reach {}, Sending Request Again.".format(url))
        time.sleep(10)
        r = requests.get(url, headers=header)
        context = r.text
        m.title = re.findall(pattern, context)
    m.title = m.title[0]

    # year
    pattern = re.compile(r'<span class="year">\((\d*)\)</span>')
    m.year = re.findall(pattern, context)[0]
    # director
    pattern = re.compile(r'rel="v:directedBy">([^<]*)')
    m.director = re.findall(pattern, context)
    # writer
    pattern = re.compile(r'<a href="/celebrity/\d*/">([^<]*)</a>')
    m.writer = re.findall(pattern, context)
    # actor
    pattern = re.compile(r'rel="v:starring">([^<]*)</a>')
    m.actor = re.findall(pattern, context)
    # type (genre)
    pattern = re.compile(r'<span property="v:genre">([^<]*)</span>')
    m.type = re.findall(pattern, context)
    # country / region
    pattern = re.compile(r'<span class="pl">制片国家/地区:</span>([^<]*)')
    m.country = re.findall(pattern, context)
    # language
    pattern = re.compile(r'<span class="pl">语言:</span>([^<]*)')
    m.language = re.findall(pattern, context)
    # length -- two pages have broken markup, so their runtimes are
    # hard-coded.
    if m.id == 3734350:
        m.length = 6
    elif m.id == 6146955:
        m.length = 81
    else:
        pattern = re.compile(r'<span property="v:runtime" content="(\d+)')
        m.length = re.findall(pattern, context)[0]
    # score
    pattern = re.compile(r'property="v:average">([\d\.]+)</strong>')
    m.score = re.findall(pattern, context)[0]

    # Accumulate into the parallel per-column lists on self.
    self.ids.append(m.id)
    self.titles.append(m.title)
    self.years.append(m.year)
    self.directors.append(m.director)
    self.writers.append(m.writer)
    self.actors.append(m.actor)
    self.types.append(m.type)
    self.countries.append(m.country)
    self.languages.append(m.language)
    self.lengths.append(m.length)
    self.scores.append(m.score)
# NOTE(review): this conn.close() is the tail of a definition whose
# `def` line lies before this chunk -- presumably a close() method that
# releases the database connection; left untouched.
conn.close()


# Demo / smoke test: build the database, insert the same post three
# times, then read it back by name and by primary key.
if __name__ == '__main__':
    DataBase.CreateDb()
    db = DataBase()
    m = movie(name='name',
              author='author',
              time='time',
              tlink='tlink',
              poster='poster',
              genre={
                  'r_doc': 'r_doc',
                  'r_sci': 'r_sci',
                  'r_mys': 'r_mys',
                  'r_thr': 'r_thr',
                  'r_act': 'r_act',
                  'r_phi': 'r_phi',
                  'r_com': 'r_com',
                  'r_min': 'r_min',
                  'r_exp': 'r_exp'
              },
              review='review',
              trailer='trailer',
              dd_link='dd_link')
    db.AddPost(m)
    db.AddPost(m)
    db.AddPost(m)
    print(db.getMovieByname('name'))  # lookup by name
    print(db.getMovieByID(1))         # lookup by idx
    db.close()