def detail_parse_by_subclass(self, url, page):
    res = self.parser.get_parse_data(url, page)
    urllist = []
    it = Item()
    for l in res['list']:
        li = l['li']
        # "地区:" = region, "年份:" = year, "导演:" = director, "主演:" = starring
        if "地区:" in li:
            it.location = li.replace("地区:", '')
        if "年份:" in li:
            it.date = li.replace("年份:", '')
        if "导演:" in li:
            it.director = li.replace("导演:", '')
        if "主演:" in li:
            it.actors = li.replace("主演:", '')
    it.url = url
    # Collect the available video qualities as a "/"-joined string.
    quality = {}
    for tinfo in res['tinfo']:
        qs = tinfo['quality']
        if '720p' in qs:
            quality['720p'] = 1
        if '1080p' in qs:
            quality['1080p'] = 1
        if 'CAM' in qs or 'TS' in qs:
            quality['TS'] = 1
    it.quality = '/'.join(quality.keys())
    imdb_url = res['imdb']
    if imdb_url is not None and len(imdb_url) > 0:
        urllist.append("http://www.imdb.com/title/" + imdb_url)
    return it, urllist

def get_posts(self, request):
    all_posts = []
    try:
        # Multiply the distance value by 1000 because the API measures
        # distance in meters.
        media_search = self.api.media_search(
            count=10,
            lat=request.form['lat'],
            lng=request.form['long'],
            distance=request.form['value'] * 1000)
        if media_search:
            for media in media_search:
                if hasattr(media, 'images') and "standard_resolution" in media.images:
                    img_urls = [media.images['standard_resolution'].url]
                    if hasattr(media.caption, 'text'):
                        text = Util.remove_non_ascii(media.caption.text)
                    else:
                        text = ""
                    tags = Util.extract_hashtags(text)
                    item = Item(text, img_urls, tags)
                    all_posts.append(item)
    except UnicodeEncodeError:
        pass  # NOOP
    return all_posts

def _ensure_items(self):
    if self._item_map is None:
        self._item_map = {}
        for item_id, name, genres in \
                self.connection.query("SELECT item_id, name, genres FROM items"):
            self._item_map[item_id] = Item(item_id, name, genres.split(","))

from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup


def get_items(urls):
    """Generate the items found at urls."""
    with ThreadPoolExecutor(20) as executor:
        futures = {executor.submit(requests.get, url): url for url in urls}
        for fut in as_completed(futures):
            soup = BeautifulSoup(fut.result().content, 'lxml')
            yield Item(soup.title.get_text(),
                       soup.get_text().replace('\xa0', ' '),
                       futures[fut])

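# Hypothetical usage sketch for get_items (not part of the original code):
# the URLs below are placeholders, and it assumes the Item constructor's first
# argument is exposed as a `title` attribute.
if __name__ == "__main__":
    sample_urls = ["https://example.com", "https://example.org"]
    for item in get_items(sample_urls):
        print(item.title)
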
import regex


def trim(item):
    # Strip the leading "Сталин И.В." (Stalin I.V.) author prefix from the title,
    # if present.
    title_pat = regex.compile(r'(?<=Сталин И.В. ).*')
    title_match = title_pat.search(item.title)
    new_title = item.title if title_match is None else title_match.group()
    # Keep only the body between the "Источник:" (source) header block and the
    # trailing "ПРИМЕЧАНИЕ/ПРИМЕЧАНИЯ" (notes) section or a run of blank lines.
    text_pat = regex.compile(
        r'(?<=Источник:\n(.*\n){6})(?:.*\n)+?(?=\n{6}|\nПРИМЕЧАНИ[ЕЯ]\n)')
    text_match = text_pat.search(item.text)
    new_text = item.text if text_match is None else text_match.group()
    return Item(new_title, new_text)

def __init__(self, path: str, clip: int = 100000):
    self.ratings = pd.read_csv(
        path + "/ratings.csv",
        nrows=clip,
        dtype={"userId": str, "movieId": str, "rating": float})
    movies = pd.read_csv(
        path + "/movies.csv",
        dtype={"movieId": str, "title": str, "genres": str})
    self.movies = {
        item_id: Item(item_id, name, genres.split("|"))
        for item_id, name, genres in movies[["movieId", "title", "genres"]].values
    }  # type: Dict[str, Item]

def search_tweets(self, lat, lng, dist):
    gcode = str(lat) + "," + str(lng) + "," + str(dist) + "km"
    public_tweets = self.api.search(geocode=gcode, count=100,
                                    include_entities=True)
    tags = {}
    tag_to_tweets = {}
    all_tweets = []
    for tweet in public_tweets:
        new_tweet = Util.remove_non_ascii(tweet.text)
        if len(new_tweet) > 0:
            hashtags = Util.extract_hashtags(new_tweet)
            image_urls = []
            if 'media' in tweet.entities:
                image_urls = Util.extract_image_urls(tweet.entities["media"])
            text = Util.extract_text(new_tweet)
            tweet_object = Item(text, image_urls, hashtags)
            # Count each hashtag and group the tweets that carry it.
            for tag in hashtags:
                if tag in tags:
                    tags[tag] += 1
                    tag_to_tweets[tag].append(tweet_object)
                else:
                    tags[tag] = 1
                    tag_to_tweets[tag] = [tweet_object]
    # Keep only the tweets belonging to the TOP_K most frequent hashtags.
    sorted_tags = Util.sort_dict_by_values(tags)
    for i in range(min(self.TOP_K, len(sorted_tags))):
        for tweet in tag_to_tweets[sorted_tags[i][0]]:
            all_tweets.append(tweet)
    # Remove all duplicate items
    return list(set(all_tweets))

if __name__ == "__main__":
    import common
    from common import Item
    from feature_builder import *

    feature_builder = FeatureBuilder()

    item1 = Item(1, 16, [2, 3, 4], 1)
    feature_builder.add_an_item(item1)
    item1.pos = 2
    feature_builder.add_an_item(item1)

    item2 = Item(2, 15, [2, 3, 4], 1)
    feature_builder.add_an_item(item2)
    item2.pos = 2
    feature_builder.add_an_item(item2)

    item3 = Item(3, 17, [2, 7, 4], 1)
    feature_builder.add_an_item(item3)
    item3.pos = 2
    feature_builder.add_an_item(item3)

    dim_items_index = {
        item1.item_id: item1,
        item2.item_id: item2,
        item3.item_id: item3,
    }

def detail_parse_by_subclass(self, url, page):
    it = Item()
    it.url = url
    con = self.parser.get_parse_data(url, page)
    ename = con['name']
    if ename is not None and len(ename) > 0:
        it.ename = ename.split("(")[0].strip()
    else:
        return None, []
    tlist = []
    for t in con['typelist']:
        tlist.append(t['type'])
    it.type = '/'.join(tlist)
    # URLs contain e.g. "title/tt0061811"; strip the "tt" prefix to get the
    # numeric IMDb id.
    imdbid = url.split("title/")[1].split("/")[0]
    if "tt" in imdbid:
        imdbid = imdbid.replace('tt', '')
    it.id = int(imdbid)
    it.date = con['date']
    it.pic_url = con['pic_url']
    if it.pic_url is None:
        it.pic_url = "nopic"
    it.rate = con['rate']
    if it.rate is None:
        it.rate = 0
    it.director = con['director']
    it.actors = con['actors']
    if it.actors is None:
        return None, []
    it.actors = it.actors.split("|")[0].replace("Stars:", '').strip()
    info = con['box']
    # Budget: $170,000,000 (estimated)
    if "Budget:" in info:
        box = info.split("Budget:")[1].split("(estimated)")[0]
        box = box.replace(",", '').strip()
        p = re.compile(r'([\d]+)')
        match = p.search(box)
        if match is not None:
            it.box = int(match.group()) / 10000
        else:
            it.box = 0
    return it, []

import pickle


def load_items():
    # Each pickle (t1.pickle .. t18.pickle) maps keys to dicts whose values are
    # the positional fields of an Item.
    for n in range(18):
        with open(f'raw_tomes/t{n + 1}.pickle', 'rb') as f:
            for v in pickle.load(f).values():
                yield Item(*v.values())

def remove_initials(item):
    # Strip two-letter initials such as "И.В. " when they precede a word.
    pat = regex.compile(r'\p{lu}\.\p{lu}\. (?=\w)')
    new_text = regex.sub(pat, r'', item.text)
    return Item(item.title, new_text)

def remove_page_numbers(item):
    # Drop bracketed page markers of the form "[c.<up to 6 chars>]".
    pat = regex.compile(r'\[c\..{,6}\]')
    new_text = regex.sub(pat, r'', item.text)
    return Item(item.title, new_text)

def remove_returns(item):
    # Drop carriage returns left over from Windows-style line endings.
    pat = regex.compile(r'\r')
    new_text = regex.sub(pat, r'', item.text)
    return Item(item.title, new_text)

def detail_parse_by_subclass(self, url, page):
    it = Item()
    newurl = []
    flist = url.split('/')
    it.id = flist[-1]
    doc = pyq(page)
    tmp = doc('div[id=info]')
    for v in tmp:
        info = pyq(v).text().encode("UTF-8")
        # "制片国家/地区:" = country/region, "语言:" = language
        idx = str(info).find("制片国家/地区:")
        idx2 = str(info).find("语言:")
        if idx > 0 and idx2 > 0:
            it.location = info[idx + len("制片国家/地区:"):idx2]
        tl = pyq(v)('span[property=\'v:genre\']')
        for t in tl:
            it.type += '/' + pyq(t).text().encode("UTF-8")
        # "集数" (episode count) / "单集片长" (episode runtime) mark a TV series.
        if "集数" in info or "单集片长" in info:
            it.channel = 1
        # "编剧" = writer, "主演" = starring
        idx = str(info).find("编剧")
        idx2 = str(info).find("主演")
        if idx > 0 and idx2 > 0:
            it.writer = info[idx + len("编剧"):idx2]
        # "又名" = alternative title
        idx = str(info).find("又名")
        idx2 = str(info).find("IMDb")
        if idx > 0 and idx2 > 0:
            it.aname = info[idx + len("又名") + 1:idx2]
        it.runtime = "null"
        runtv = pyq(v)('span[property=\'v:runtime\']')
        if runtv is not None and runtv.text() is not None:
            it.runtime = runtv.text().encode("UTF-8")
        it.director = "null"
        director = pyq(v)('a[rel=\'v:directedBy\']')
        if director is not None and director.text() is not None:
            it.director = director.text().encode("UTF-8")
        ac = pyq(v)('a[rel=\'v:starring\']')
        for actor in ac:
            it.actors += "/" + pyq(actor).text().encode("UTF-8")
        st = pyq(v)('span[property=\'v:initialReleaseDate\']')
        it.date = "0"
        if st is not None and st.text() is not None:
            it.date = st.text().encode("UTF-8")
        # Collect any outbound IMDb link for a follow-up crawl.
        al = pyq(v)('a[rel=\'nofollow\']')
        for a in al:
            name = pyq(a).attr('href').encode("UTF-8")
            index = str(name).find("imdb")
            if index >= 0:
                it.imdb_link = name
                newurl.append(it.imdb_link)
    imgdiv = doc('div[id=mainpic]')
    img = imgdiv('img[rel=\'v:image\']')
    if img is not None:
        it.pic_url = img.attr('src')
    it.summary = "NULL"
    smy = doc('span[property=\'v:summary\']')
    if smy is not None and smy.text() is not None:
        it.summary = smy.text().encode("UTF-8")
    smy = doc('div[id=review_section]')
    if smy is not None:
        tt = smy('div').eq(1)
        aa = tt('h2')('a')
        if aa is not None:
            it.comment_link = aa.attr('href')
    it.rate = "0"
    rate = doc('strong[property=\'v:average\']')
    if rate is not None and rate.text() is not None:
        it.rate = rate.text().encode("UTF-8")
    it.votes = "0"
    votes = doc('span[property=\'v:votes\']')
    if votes is not None and votes.text() is not None:
        it.votes = votes.text().encode("UTF-8")
    namestr = doc('meta[name=\'keywords\']')
    if len(namestr) > 0:
        s = namestr.attr('content').encode("UTF-8").split(",")
        it.cname = s[0]
        it.ename = s[1]
    return it, newurl

def detail_parse_by_subclass(self, url, page):
    res = self.parser.get_parse_data(url, page)
    con = res['content']
    # Detail pages list their fields as "◎<label> <value>" blocks.
    strlist = con.split("◎")
    newurl = []
    it = None
    if len(strlist) > 2:
        it = Item()
        it.url = url
        for s in strlist:
            # "译 名" = translated title, "片 名" = original title
            if "译 名" in s:
                it.cname = s.split("译 名")[1].strip()
            if "片 名" in s:
                it.ename = s.split("片 名")[1].strip()
            # "年 代" = year, "国 家" = country
            if "年 代" in s:
                it.date = s.split("年 代")[1].strip()
            if "国 家" in s:
                it.location = s.split("国 家")[1].strip()
            # "上映时期" / "上映日期" = release date
            if "上映时期" in s:
                it.date = s.split("上映时期")[1].strip()
            elif "上映日期" in s:
                it.date = s.split("上映日期")[1].strip()
            # "链接" = link (usually the IMDb URL)
            if "链接" in s:
                u = s.split("链接")[1].strip()
                if "http://" in u:
                    pos = u.find("http://")
                    newurl.append((u[pos:]).strip('/'))
            # "导 演" = director
            if "导 演" in s:
                it.director = s.split("导 演")[1].strip()
            # "主 演" / "演 员" = cast
            if "主 演" in s:
                it.actors = s.split("主 演")[1].strip()
            elif "演 员" in s:
                it.actors = s.split("演 员")[1].strip()
    else:
        it = Item()
        it.url = url
        it.content = con
    return it, newurl

def remove_newlines(item):
    # Replace newlines with spaces, then collapse the resulting double spaces.
    new_text = item.text.replace('\n', ' ')
    new_text = new_text.replace('  ', ' ')
    return Item(item.title, new_text)

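# Hypothetical composition sketch (not from the original code): each cleaning
# helper above takes an Item and returns a new Item, so they chain naturally.
# The ordering of the steps and the use of load_items here are assumptions.
def clean(item):
    for step in (trim, remove_page_numbers, remove_initials,
                 remove_returns, remove_newlines):
        item = step(item)
    return item


if __name__ == "__main__":
    # Assumes the raw_tomes/*.pickle files read by load_items exist on disk.
    cleaned = [clean(item) for item in load_items()]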