def crawl_data(self, url):
        r = requests.get(url, headers=headers)
        m = movie()
        self.current_mid = m.id = url.split('/')[-2]
        # movie title
        pattern = re.compile('<span property="v:itemreviewed">(.*)</span>')
        m.title = re.findall(pattern, r.text)
        while m.title == []:    # an empty title means the page was not fetched
            print('Can NOT Reach {}, Sending Request Again'.format(url))
            time.sleep(10)
            r = requests.get(url, headers=headers)
            m.title = re.findall(pattern, r.text)
        m.title = m.title[0]
        # year
        pattern = re.compile(r'<span class="year">\((\d*)\)</span>')
        m.year = re.findall(pattern, r.text)[0]
        # director
        pattern = re.compile('rel="v:directedBy">([^<]*)')
        m.director = re.findall(pattern, r.text)
        # screenwriter
        pattern = re.compile(r'<a href="/celebrity/\d*/">([^<]*)</a>')
        m.writer = re.findall(pattern, r.text)
        # starring
        pattern = re.compile('rel="v:starring">([^<]*)</a>')
        m.starring = re.findall(pattern, r.text)
        # genre
        pattern = re.compile('<span property="v:genre">([^<]*)</span>')
        m.genre = re.findall(pattern, r.text)
        # country/region
        pattern = re.compile('制片国家/地区:</span>([^<]*)')
        m.country = re.findall(pattern, r.text)
        # language
        pattern = re.compile('语言:</span>([^<]*)')
        m.language = re.findall(pattern, r.text)
        # runtime
        if m.id == '3734350':   # two special pages with unusual markup; the regex below cannot extract the runtime
            m.length = 6
        elif m.id == '6146955':
            m.length = 81
        else:
            pattern = re.compile(r'property="v:runtime" content="(\d+)')
            m.length = re.findall(pattern, r.text)[0]
        # rating
        pattern = re.compile(r'property="v:average">([\d\.]+)</strong>')
        m.rating = re.findall(pattern, r.text)[0]

        # m.print_info()

        SQL = m.generate_sql()
        flag = True
        for sql in SQL:
            if self.execute_sql(sql) is False:
                flag = False
        if flag:
            print('Process-{} Successfully Saved {}'.format(self.id, m.title))
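
A minimal sketch of the module-level scaffolding the method above assumes; the actual project defines its own headers dict and movie class, and the field names below are only inferred from the attributes used above, not taken from the project:

import re
import time
import requests

# Hypothetical request headers; the real project supplies its own.
headers = {'User-Agent': 'Mozilla/5.0 (compatible; movie-crawler)'}


class movie:
    """Stand-in record for a Douban movie page (fields inferred from usage)."""

    def __init__(self):
        self.id = None
        self.title = ''
        self.year = ''
        self.director = []
        self.writer = []
        self.starring = []
        self.genre = []
        self.country = []
        self.language = []
        self.length = 0
        self.rating = 0.0

    def generate_sql(self):
        # Placeholder: the real implementation builds INSERT statements from the fields above.
        return []
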
Example 2
def getMovieByname(self, name):
    self.c.execute('SELECT * FROM posts WHERE name=?', (name, ))
    rsp = self.c.fetchone()
    if rsp:
        mv = movie(rsp['author'],
                   rsp['poster'],
                   rsp['review'],
                   rsp['tlink'],
                   rsp['trailer'],
                   rsp['dd_link'],
                   rsp['name'],
                   self.getGenre(rsp),
                   idx=rsp['idx'],
                   time=rsp['time'])
        return mv
    else:
        return None
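
The dictionary-style row access above (rsp['author'], rsp['name'], ...) only works if the sqlite3 connection uses sqlite3.Row as its row factory; a minimal sketch of that setup, with an assumed database file name:

import sqlite3

conn = sqlite3.connect('movies.db')   # assumed file name, for illustration only
conn.row_factory = sqlite3.Row        # enables rsp['column'] access on fetched rows
c = conn.cursor()
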
Example 3
def parseFragileDesires(url):
    page = requests.get(url)
    tree = html.fromstring(page.content)

    movies = []
    # //*[@class="attachment-home-image size-home-image wp-post-image"]
    videos = tree.xpath(
        '//*[@class="attachment-home-image size-home-image wp-post-image"]')
    # print(videos)
    for eachVideo in videos:
        vid = movie()

        # url of MainPic
        urlOfMainPic = eachVideo.get("src")
        vid.mainImage = urlOfMainPic
        # print(urlOfMainPic)

        vid.site = "http://www.FragileDesires.com"

        # url to MainVid
        linkToVideoPage = eachVideo.getparent().get("onclick").split("'")[1]
        # print(linkToVideoPage)

        if ("member" in linkToVideoPage) or ("archives" in linkToVideoPage):
            # Site specific ID
            siteSpecificID = urlOfMainPic.split("/")[-1][:-4]
            vid.movieID = siteSpecificID
            # print(siteSpecificID)

            pageOfEachVid = requests.get(linkToVideoPage)
            treeOfEachVid = html.fromstring(pageOfEachVid.content)

            title = treeOfEachVid.xpath(
                '//*[@class="single_title"]')[0].text[6:]
            # print(title)
            vid.title = title

            # Adding Nana as the only Actress
            vid.actors.append("Nana")

            print(vid.createAddMovieUrl())
            requests.get(vid.createAddMovieUrl())

        else:  # not a real video, e.g. the first item on the page is a diary entry
            pass
Example 4
def getMovieByID(self, idx):
    self.c.execute('SELECT * FROM posts WHERE idx=?', (idx, ))
    rsp = self.c.fetchone()
    if rsp:
        mv = movie(rsp['author'],
                   rsp['poster'],
                   rsp['review'],
                   rsp['tlink'],
                   rsp['trailer'],
                   rsp['dd_link'],
                   rsp['name'],
                   self.getGenre(rsp),
                   LikedCount=rsp['LikedCount'],
                   idx=rsp['idx'],
                   time=rsp['time'])
        return mv
    else:
        return None
Example 5
def parseHardtiedPage(url):
    page = requests.get(url)
    tree = html.fromstring(page.content)

    # This will create a list of Videos on that page:
    movies = []
    videos = tree.xpath('//table[@class="articleWrapper"]')
    for eachVideo in videos:

        vid = movie()

        siteURL = "http://www.hardtied.com"
        # let's find the per-movie URL
        postUrl = eachVideo.xpath(
            '*//span[@class="articleTitleText"]/a')[0].get("href")
        uniqueVidUrl = siteURL + postUrl
        # print(uniqueVidUrl)
        uniquePage = requests.get(uniqueVidUrl)
        uniquePageTree = html.fromstring(uniquePage.content)
        mainPartOfPage = uniquePageTree.xpath(
            '//table[@class="articleWrapper"]')[0]

        # Find Title and Actress(es)
        for x in mainPartOfPage.xpath('*//span[@class="articleTitleText"]'):
            # print(x.text_content())
            innerTextSplit = x.text_content().split("|")
            vid.title = innerTextSplit[0].strip()
            # print(innerTextSplit)
            for eachActor in innerTextSplit[1:]:
                vid.actors.append(eachActor.strip())
        # Find Date of Video
        uglyDate = mainPartOfPage.xpath(
            '*//span[@class="articlePostDateText"]')[0].text_content()
        vid.date = parse(uglyDate)

        # Find Summary:
        for x in mainPartOfPage.xpath('*//*[@class="articleCopyText"]'):
            # print(x.text_content())
            vid.summary = x.text_content()

        # Find Tags:
        try:
            tagBlock = uniquePageTree.xpath(
                '//*[@class="articleTagsText"]')[0].text
            strippedBeginning = tagBlock[5:]  # drop the leading label text
            separatedByCommas = strippedBeginning.replace("\t", "").replace(
                "\n", "").split(", ")
            allTags = []
            for eachTag in separatedByCommas:
                allTags.append(eachTag.strip())
            # print("block: ", allTags)
            vid.tags = allTags
        except Exception:  # tag block missing or malformed on some pages
            pass

        # Find All Images:
        allPhotos = mainPartOfPage.xpath('*//img')
        # print(allPhotos)
        for x in allPhotos:
            url = x.get("src")
            if "poster.jpg" in url:
                # print(url)
                vid.mainImage = url
            else:
                fullSizeImage = x.getparent().get("href")
                if "images" in fullSizeImage:
                    vid.images.append(fullSizeImage)

        vid.site = siteURL

        movies.append(vid)
    return movies
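
The FragileDesires and HardTied parsers above rely on requests, lxml and python-dateutil; the imports they assume (not shown in this listing) would look like:

import requests
from lxml import html
from dateutil.parser import parse
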
Example 6
    def get_data(self, url):
        header = get_header()
        try:
            r = requests.get(url, headers=header)
        except requests.exceptions.RequestException:
            print("Connection Refused. We Are Trying Again.")
            time.sleep(10)
            header = get_header()
            r = requests.get(url, headers=header)
        context = r.text

        m = movie()
        self.current_movie_id = m.id = int(url[33:-1])  # numeric id from a URL like https://movie.douban.com/subject/<id>/
        # title
        pattern = re.compile('<span property="v:itemreviewed">(.*)</span>')
        m.title = re.findall(pattern, context)
        while m.title == []:
            print("Can NOT Reach {}, Sending Request Again.".format(url))
            time.sleep(10)
            r = requests.get(url, headers=header)
            context = r.text
            m.title = re.findall(pattern, context)
        m.title = m.title[0]
        # year
        pattern = re.compile(r'<span class="year">\((\d*)\)</span>')
        m.year = re.findall(pattern, context)[0]
        # director
        pattern = re.compile('rel="v:directedBy">([^<]*)')
        m.director = re.findall(pattern, context)
        # writer
        pattern = re.compile(r'<a href="/celebrity/\d*/">([^<]*)</a>')
        m.writer = re.findall(pattern, context)
        # actor
        pattern = re.compile('rel="v:starring">([^<]*)</a>')
        m.actor = re.findall(pattern, context)
        # type
        pattern = re.compile('<span property="v:genre">([^<]*)</span>')
        m.type = re.findall(pattern, context)
        # country
        pattern = re.compile('<span class="pl">制片国家/地区:</span>([^<]*)')
        m.country = re.findall(pattern, context)
        # language
        pattern = re.compile('<span class="pl">语言:</span>([^<]*)')
        m.language = re.findall(pattern, context)
        # length
        if m.id == 3734350:
            m.length = 6
        elif m.id == 6146955:
            m.length = 81
        else:
            pattern = re.compile(r'<span property="v:runtime" content="(\d+)')
            m.length = re.findall(pattern, context)[0]
        # score
        pattern = re.compile(r'property="v:average">([\d\.]+)</strong>')
        m.score = re.findall(pattern, context)[0]

        self.ids.append(m.id)
        self.titles.append(m.title)
        self.years.append(m.year)
        self.directors.append(m.director)
        self.writers.append(m.writer)
        self.actors.append(m.actor)
        self.types.append(m.type)
        self.countries.append(m.country)
        self.languages.append(m.language)
        self.lengths.append(m.length)
        self.scores.append(m.score)
Example 7
        conn.close()


if __name__ == '__main__':
    DataBase.CreateDb()
    db = DataBase()
    m = movie(name='name',
              author='author',
              time='time',
              tlink='tlink',
              poster='poster',
              genre={
                  'r_doc': 'r_doc',
                  'r_sci': 'r_sci',
                  'r_mys': 'r_mys',
                  'r_thr': 'r_thr',
                  'r_act': 'r_act',
                  'r_phi': 'r_phi',
                  'r_com': 'r_com',
                  'r_min': 'r_min',
                  'r_exp': 'r_exp'
              },
              review='review',
              trailer='trailer',
              dd_link='dd_link')
    db.AddPost(m)
    db.AddPost(m)
    db.AddPost(m)
    print(db.getMovieByname('name'))
    print(db.getMovieByID(1))
    db.close()