Example #1
    def detail_parse_by_subclass(self, url, page):
        res = self.parser.get_parse_data(url, page)
        urllist = []
        it = Item()
        it.url = url
        for l in res['list']:
            li = l['li']
            if "地区:" in li:        # "Region:"
                it.location = li.replace("地区:", '')
            if "年份:" in li:        # "Year:"
                it.date = li.replace("年份:", '')
            if "导演:" in li:        # "Director:"
                it.director = li.replace("导演:", '')
            if "主演:" in li:        # "Starring:"
                it.actors = li.replace("主演:", '')
        quality = {}
        for tinfo in res['tinfo']:
            qs = tinfo['quality']
            if '720p' in qs:
                quality['720p'] = 1

            if '1080p' in qs:
                quality['1080p'] = 1

            if 'CAM' in qs or 'TS' in qs:
                quality['TS'] = 1
        it.quality = '/'.join(quality)

        imdb_url = res['imdb']
        if imdb_url:
            urllist.append("http://www.imdb.com/title/" + imdb_url)
        return it, urllist
Example #2
    def get_posts(self, request):
        all_posts = []

        try:
            # Convert the form value to an int and multiply by 1000 because
            # the API measures distance in meters.
            media_search = self.api.media_search(
                count=10,
                lat=request.form['lat'],
                lng=request.form['long'],
                distance=int(request.form['value']) * 1000)

            if media_search:
                for media in media_search:
                    img_urls = []
                    if (hasattr(media, 'images')
                            and "standard_resolution" in media.images):
                        img_urls = [media.images['standard_resolution'].url]

                    if hasattr(media.caption, 'text'):
                        text = Util.remove_non_ascii(media.caption.text)
                    else:
                        text = ""

                    tags = Util.extract_hashtags(text)

                    item = Item(text, img_urls, tags)

                    all_posts.append(item)

        except UnicodeEncodeError:
            pass  # NOOP

        return all_posts
Example #3
    def _ensure_items(self):
        if self._item_map is None:
            self._item_map = {}
            for item_id, name, genres in \
                    self.connection.query("SELECT item_id, name, genres FROM items"):
                self._item_map[item_id] = Item(item_id, name,
                                               genres.split(","))
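The None check makes this a lazy, build-once cache: the SQL query runs only on first use. A minimal sketch of a caller, with get_item as a hypothetical accessor on the same class:

    def get_item(self, item_id):
        # Build the map on first access, then serve lookups from memory.
        self._ensure_items()
        return self._item_map.get(item_id)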
Example #4
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup

def get_items(urls):
    """Generate the items found at urls."""
    with ThreadPoolExecutor(20) as executor:
        # Fetch every URL concurrently, mapping each future back to its URL.
        futures = {executor.submit(requests.get, url): url for url in urls}

        for fut in as_completed(futures):
            soup = BeautifulSoup(fut.result().content, 'lxml')
            # '\xa0' is a non-breaking space; normalize it to a plain space.
            yield Item(soup.title.get_text(),
                       soup.get_text().replace('\xa0', ' '), futures[fut])
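A minimal sketch of driving the generator (the URLs are illustrative):

for item in get_items(['https://example.com/a', 'https://example.com/b']):
    ...  # each Item carries the page title, its text, and the source URL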
Example #5
import regex

def trim(item):
    # Drop the leading author prefix ("Сталин И.В. ", i.e. "Stalin I.V. ")
    # from the title, if present.
    title_pat = regex.compile(r'(?<=Сталин И.В. ).*')
    title_match = title_pat.search(item.title)
    new_title = item.title if title_match is None else title_match.group()

    # Keep only the body text that follows the six-line "Источник:"
    # ("Source:") block and precedes either a run of blank lines or the
    # "ПРИМЕЧАНИЕ/ПРИМЕЧАНИЯ" ("Note(s)") section.
    text_pat = regex.compile(
        r'(?<=Источник:\n(.*\n){6})(?:.*\n)+?(?=\n{6}|\nПРИМЕЧАНИ[ЕЯ]\n)')
    text_match = text_pat.search(item.text)
    new_text = item.text if text_match is None else text_match.group()

    return Item(new_title, new_text)
Example #6
    def __init__(self, path: str, clip: int = 100000):
        self.ratings = pd.read_csv(path + "/ratings.csv", nrows=clip, dtype={
            "userId": str,
            "movieId": str,
            "rating": float
        })

        movies = pd.read_csv(path + "/movies.csv", dtype={
            "movieId": str,
            "title": str,
            "genres": str
        })

        # Build a movieId -> Item lookup from the movies table.
        self.movies = {item_id: Item(item_id, name, genres.split("|"))
                       for item_id, name, genres
                       in movies[["movieId", "title", "genres"]].values}  # type: Dict[str, Item]
Example #7
    def search_tweets(self, lat, lng, dist):
        gcode = str(lat) + "," + str(lng) + "," + str(dist) + "km"
        public_tweets = self.api.search(geocode=gcode,
                                        count=100,
                                        include_entities=True)
        tags = {}
        tag_to_tweets = {}
        all_tweets = []

        for tweet in public_tweets:
            new_tweet = Util.remove_non_ascii(tweet.text)
            if len(new_tweet) > 0:
                hashtags = Util.extract_hashtags(new_tweet)
                image_urls = []
                if 'media' in tweet.entities:
                    image_urls = Util.extract_image_urls(
                        tweet.entities["media"])

                text = Util.extract_text(new_tweet)
                tweet_object = Item(text, image_urls, hashtags)

                # Count each hashtag and group the tweets that use it.
                for tag in hashtags:
                    if tag in tags:
                        tags[tag] += 1
                        tag_to_tweets[tag].append(tweet_object)
                    else:
                        tags[tag] = 1
                        tag_to_tweets[tag] = [tweet_object]

        sorted_tags = Util.sort_dict_by_values(tags)

        # Keep only the tweets for the TOP_K most frequent hashtags.
        for i in range(min(self.TOP_K, len(sorted_tags))):
            all_tweets.extend(tag_to_tweets[sorted_tags[i][0]])

        # Remove all duplicate items
        return list(set(all_tweets))
Example #8
if __name__ == "__main__":

    import common
    from common import Item
    from feature_builder import *

    feature_builder = FeatureBuilder()

    # Each item is added, its pos is changed to 2, and it is added again.
    item1 = Item(1, 16, [2, 3, 4], 1)
    feature_builder.add_an_item(item1)
    item1.pos = 2
    feature_builder.add_an_item(item1)

    item2 = Item(2, 15, [2, 3, 4], 1)
    feature_builder.add_an_item(item2)
    item2.pos = 2
    feature_builder.add_an_item(item2)

    item3 = Item(3, 17, [2, 7, 4], 1)
    feature_builder.add_an_item(item3)
    item3.pos = 2
    feature_builder.add_an_item(item3)

    dim_items_index = {item1.item_id: item1,
                       item2.item_id: item2,
                       item3.item_id: item3}
Example #9
    def detail_parse_by_subclass(self, url, page):
        it = Item()
        it.url = url
        con = self.parser.get_parse_data(url, page)
        ename = con['name']
        if ename:
            it.ename = ename.split("(")[0].strip()
        else:
            return None, []

        tlist = [t['type'] for t in con['typelist']]
        it.type = '/'.join(tlist)

        # e.g. a url containing "title/tt0061811" yields the numeric id 61811
        imdbid = url.split("title/")[1].split("/")[0]
        if "tt" in imdbid:
            imdbid = imdbid.replace('tt', '')
        it.id = int(imdbid)

        it.date = con['date']
        it.pic_url = con['pic_url']
        if it.pic_url is None:
            it.pic_url = "nopic"
        it.rate = con['rate']
        if it.rate is None:
            it.rate = 0
        it.director = con['director']
        it.actors = con['actors']
        if it.actors is None:
            return None, []
        it.actors = it.actors.split("|")[0].replace("Stars:", '').strip()

        info = con['box']
        # e.g. "Budget: $170,000,000 (estimated)"
        if "Budget:" in info:
            box = info.split("Budget:")[1].split("(estimated)")[0]
            box = box.replace(",", '').strip()
            match = re.search(r'\d+', box)
            if match is not None:
                # Store the budget in units of 10,000 (integer division).
                it.box = int(match.group()) // 10000
            else:
                it.box = 0
        return it, []
Example #10
import pickle

def load_items():
    """Generate the items stored in the pickled tome files."""
    for n in range(18):
        # The tome files are named t1.pickle through t18.pickle.
        with open(f'raw_tomes/t{n + 1}.pickle', 'rb') as f:
            for v in pickle.load(f).values():
                yield Item(*v.values())
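A minimal way to consume the generator (assuming the raw_tomes/*.pickle files are present):

items = list(load_items())  # materialize every tome's items at once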
Example #11
import regex

def remove_initials(item):
    # Strip pairs of uppercase initials such as "И.В. " that precede a word.
    pat = regex.compile(r'\p{lu}\.\p{lu}\. (?=\w)')
    new_text = regex.sub(pat, '', item.text)

    return Item(item.title, new_text)
Example #12
import regex

def remove_page_numbers(item):
    # Remove bracketed page markers such as "[c. 123]".
    pat = regex.compile(r'\[c\..{,6}\]')
    new_text = regex.sub(pat, '', item.text)

    return Item(item.title, new_text)
Example #13
import regex

def remove_returns(item):
    # Strip carriage return characters from the text.
    pat = regex.compile(r'\r')
    new_text = regex.sub(pat, '', item.text)

    return Item(item.title, new_text)
Example #14
    def detail_parse_by_subclass(self, url, page):
        it = Item()
        newurl = []
        flist = url.split('/')
        it.id = flist[-1]

        doc = pyq(page)
        tmp = doc('div[id=info]')
        for v in tmp:
            info = pyq(v).text().encode("UTF-8")
            # The text between "制片国家/地区:" ("Country/region:") and
            # "语言:" ("Language:") is the production location.
            idx = str(info).find("制片国家/地区:")
            idx2 = str(info).find("语言:")
            if idx > 0 and idx2 > 0:
                it.location = info[idx + len("制片国家/地区:"):idx2]

            tl = pyq(v)('span[property=\'v:genre\']')
            for t in tl:
                it.type += '/' + pyq(t).text().encode("UTF-8")

            if "集数" in info or "单集片长" in info:
                it.channel = 1
            idx = str(info).find("编剧")
            idx2 = str(info).find("主演")
            if idx > 0 and idx2 > 0:
                it.writer = info[idx + len("编剧"):idx2]

            idx = str(info).find("又名")
            idx2 = str(info).find("IMDb")
            if idx > 0 and idx2 > 0:
                it.aname = info[idx + len("又名") + 1:idx2]
                #print it.aname
            it.runtime = "null"
            runtv = pyq(v)('span[property=\'v:runtime\']')
            if runtv is not None and runtv.text() is not None:
                it.runtime = runtv.text().encode("UTF-8")

            it.director = "null"
            director = pyq(v)('a[rel=\'v:directedBy\']')
            if director is not None and director.text() is not None:
                it.director = director.text().encode("UTF-8")

            ac = pyq(v)('a[rel=\'v:starring\']')
            for actor in ac:
                it.actors += "/" + pyq(actor).text().encode("UTF-8")

            st = pyq(v)('span[property=\'v:initialReleaseDate\']')
            it.date = "0"
            if st is not None and st.text() is not None:
                it.date = st.text().encode("UTF-8")

            # Outbound nofollow links that point at IMDb become follow-up URLs.
            al = pyq(v)('a[rel=\'nofollow\']')
            for a in al:
                name = pyq(a).attr('href').encode("UTF-8")
                if "imdb" in str(name):
                    it.imdb_link = name
                    newurl.append(it.imdb_link)

        imgdiv = doc('div[id=mainpic]')
        img = imgdiv('img[rel=\'v:image\']')
        if img is not None:
            it.pic_url = img.attr('src')

        it.summary = "NULL"
        smy = doc('span[property=\'v:summary\']')
        if smy is not None and smy.text() is not None:
            it.summary = smy.text().encode("UTF-8")

        smy = doc('div[id=review_section]')
        if smy is not None:
            tt = smy('div').eq(1)
            aa = tt('h2')('a')
            if aa is not None:
                it.comment_link = aa.attr('href')

        it.rate = "0"
        rate = doc('strong[property=\'v:average\']')
        if rate is not None and rate.text() is not None:
            it.rate = rate.text().encode("UTF-8")
        it.votes = "0"
        votes = doc('span[property=\'v:votes\']')
        if votes is not None and votes.text() is not None:
            it.votes = votes.text().encode("UTF-8")

        namestr = doc('meta[name=\'keywords\']')
        if len(namestr) > 0:
            s = namestr.attr('content').encode("UTF-8").split(",")
            it.cname = s[0]
            it.ename = s[1]

        return it, newurl
Example #15
    def detail_parse_by_subclass(self, url, page):
        res = self.parser.get_parse_data(url, page)
        con = res['content']
        # Metadata fields in the post body are delimited by "◎".
        strlist = con.split("◎")
        newurl = []
        it = Item()
        it.url = url
        if len(strlist) > 2:
            for s in strlist:
                if "译  名" in s:          # translated title
                    it.cname = s.split("译  名")[1].strip()
                if "片  名" in s:          # original title
                    it.ename = s.split("片  名")[1].strip()
                if "年  代" in s:          # year
                    it.date = s.split("年  代")[1].strip()
                if "国  家" in s:          # country
                    it.location = s.split("国  家")[1].strip()
                if "上映时期" in s:        # release period
                    it.date = s.split("上映时期")[1].strip()
                elif "上映日期" in s:      # release date
                    it.date = s.split("上映日期")[1].strip()
                if "链接" in s:            # link to the source page
                    u = s.split("链接")[1].strip()
                    if "http://" in u:
                        pos = u.find("http://")
                        newurl.append(u[pos:].strip('/'))
                if "导  演" in s:          # director
                    it.director = s.split("导  演")[1].strip()
                if "主  演" in s:          # starring
                    it.actors = s.split("主  演")[1].strip()
                elif "演  员" in s:        # cast
                    it.actors = s.split("演  员")[1].strip()
        else:
            # No recognizable fields; keep the raw content instead.
            it.content = con

        return it, newurl
Example #16
    def detail_parse_by_subclass(self, url, page):
        it = Item()
        newurl = []
        flist = url.split('/')
        it.id = flist[-1]

        doc = pyq(page)
        tmp = doc('div[id=info]')
        for v in tmp:
            info = pyq(v).text().encode("UTF-8")
            idx = str(info).find("制片国家/地区:")
            idx2 = str(info).find("语言:")
            if idx > 0 and idx2 > 0:
                it.location = info[idx + len("制片国家/地区:"):idx2]

            tl = pyq(v)('span[property=\'v:genre\']')
            for t in tl:
                it.type += '/' + pyq(t).text().encode("UTF-8")

            if "集数" in info or "单集片长" in info:
                it.channel = 1
            idx = str(info).find("编剧")
            idx2 = str(info).find("主演")
            if idx > 0 and idx2 > 0:
                it.writer = info[idx + len("编剧"):idx2]

            idx = str(info).find("又名")
            idx2 = str(info).find("IMDb")
            if idx > 0 and idx2 > 0:
                it.aname = info[idx + len("又名") + 1:idx2]

            it.runtime = "null"
            runtv = pyq(v)('span[property=\'v:runtime\']')
            if runtv is not None and runtv.text() is not None:
                it.runtime = runtv.text().encode("UTF-8")

            it.director = "null"
            director = pyq(v)('a[rel=\'v:directedBy\']')
            if director is not None and director.text() is not None:
                it.director = director.text().encode("UTF-8")

            ac = pyq(v)('a[rel=\'v:starring\']')
            for actor in ac:
                it.actors += "/" + pyq(actor).text().encode("UTF-8")

            st = pyq(v)('span[property=\'v:initialReleaseDate\']')
            it.date = "0"
            if st is not None and st.text() is not None:
                it.date = st.text().encode("UTF-8")

            al = pyq(v)('a[rel=\'nofollow\']')
            for a in al:
                name = pyq(a).attr('href').encode("UTF-8")
                index = str(name).find("imdb")
                if index >= 0:
                    it.imdb_link = name
                    newurl.append(it.imdb_link)

        imgdiv = doc('div[id=mainpic]')
        img = imgdiv('img[rel=\'v:image\']')
        if img is not None:
            it.pic_url = img.attr('src')

        it.summary = "NULL"
        smy = doc('span[property=\'v:summary\']')
        if smy is not None and smy.text() is not None:
            it.summary = smy.text().encode("UTF-8")

        smy = doc('div[id=review_section]')
        if smy is not None:
            tt = smy('div').eq(1)
            aa = tt('h2')('a')
            if aa is not None:
                it.comment_link = aa.attr('href')

        it.rate = "0"
        rate = doc('strong[property=\'v:average\']')
        if rate is not None and rate.text() is not None:
            it.rate = rate.text().encode("UTF-8")
        it.votes = "0"
        votes = doc('span[property=\'v:votes\']')
        if votes is not None and votes.text() is not None:
            it.votes = votes.text().encode("UTF-8")

        namestr = doc('meta[name=\'keywords\']')
        if len(namestr) > 0:
            s = namestr.attr('content').encode("UTF-8").split(",")
            it.cname = s[0]
            it.ename = s[1]

        return it, newurl
Example #17
def remove_newlines(item):
    # Join lines into one paragraph, then collapse the double spaces this
    # can introduce.
    new_text = item.text.replace('\n', ' ')
    new_text = new_text.replace('  ', ' ')

    return Item(item.title, new_text)
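Examples #5, #11, #12, #13, and #17 are small, composable cleanup passes: each takes an Item and returns a new Item with cleaned fields. A minimal sketch of chaining them into a single pipeline (the clean wrapper is not part of the source):

def clean(item):
    # Run each pass in turn; carriage returns and page markers are stripped
    # before remove_newlines joins the lines into one paragraph.
    for step in (trim, remove_returns, remove_page_numbers,
                 remove_initials, remove_newlines):
        item = step(item)
    return item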