def get_data_reddit():
    for url in REDDIT_RSS:
        data = rss_data(url)
        for content in data:
            print 'Storing data from ' + url
            print 'TITLE:' + content['title'] + ' URL: ' + content['url']
            print '------------------------------------------------------'
            time.sleep(0.5)
            Content.create_or_update_content(db.session, **content)
            db.session.commit()
def get_filtered_content(page):
    types = request.args.getlist("type")
    tags = request.args.getlist("tag")
    query = request.args.get("query")
    contents = []
    if tags:
        contents.extend([content.fp_serialize for content in Content.get_top_tag_filtered(page, tags)])
    elif types:
        contents.extend([content.fp_serialize for content in Content.get_top_type_filtered(page, types)])
    elif query:
        contents.extend([content.fp_serialize for content in Content.get_top_for_query(query, page)])
    return jsonify({"results": contents})
def get_top_content():
    valid_cookie = is_cookie_valid()
    history = get_history() if valid_cookie else []
    page_no = get_page_no() if valid_cookie else None
    if len(history) < 360:
        contents = [content.fp_serialize for content in Content.get_top_unviewed(history)]
    else:
        if not page_no:
            page_no = 1
        contents = [content.fp_serialize for content in Content.get_top_by_pages(page_no, history)]
    response = jsonify({"results": contents})
    if not valid_cookie:
        reset_cookie(response)
    set_cookie_time(response)
    return response
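# The two view functions above use Flask's request/jsonify, so they are
# presumably registered as routes elsewhere in the project. A minimal sketch,
# assuming an `app` object and URL rules that are not shown in this module:
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/api/content/top', 'get_top_content', get_top_content)
app.add_url_rule('/api/content/filtered/<int:page>', 'get_filtered_content', get_filtered_content)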
def get_heading_feature(content_id=0, content_data='', html_data=''):
    """
    status: optional
    """
    soup_data = ''
    #get data
    if content_id:
        html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(html)
        except:
            pass
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except:
            pass
    if not soup_data:
        soup_data = BeautifulSoup('')
    #extract h tags and features
    heading_data = get_heading_word(soup_data)
    heading_dict = text_analysis(heading_data, var_name='heading')
    return heading_dict
def is_duplicate_content(feed):
    # TODO: If there is still duplicate, we can check using raw title and domain
    feed_id = get_feed_id(feed)
    content = Content.get_content_by_feed_id(feed_id)
    if content:
        return True
    return False
def get_url_feature(content_id=0, content_data=''):
    """
    status: required
    @todo:
    """
    url = ''
    #get data
    if content_id:
        url = Content.get_content_by_id(content_id).url
    elif content_data:
        url = content_data
    if not url:
        return {}
    #extract text features from url path
    url_path = link_to_text(url)
    text_result = text_analysis(url_path, var_name='url', head_body='head')
    #extract url features
    url_result = url_analysis(url, var_name='url')
    #combine dict data
    url_dict = dict(text_result.items() + url_result.items())
    return url_dict
def update_parent_cluster(clusters, contents, session):
    """
    Set the parent cluster for newly clustered contents and propagate the new
    cluster id to contents that were clustered previously.
    """
    id_cluster = format_cluster(clusters)
    content_ids = [row[0] for row in contents]
    for content_id in content_ids:
        content = Content.get_content_by_id(content_id)
        if not content.parent_cluster:
            content.parent_cluster = id_cluster[content_id] if content_id in id_cluster else 0
            session.add(content)
        else:
            #update contents previously clustered
            if content_id in id_cluster:
                clustered_contents = Content.get_content_by_parent_cluster(content.parent_cluster)
                for clustered_content in clustered_contents:
                    clustered_content.parent_cluster = id_cluster[content_id]
                    session.add(clustered_content)
    session.commit()
def rank_contents():
    logger.info('starting content ranking...')
    start_time = time.time()
    contents = Content.get_content_for_ranking(3)
    for content in contents:
        content.rank = rank_content(content)
        session.add(content)
    session.commit()
    logger.info('ranking completed in %s', str(timedelta(seconds=time.time() - start_time)))
def requestRssData(url, google=False, newsvine=False, fark=False, force_refresh=False):
    content = feedparser.parse(url)
    data = []
    for i in content.entries:
        dictData = {}
        if not force_refresh and Content.get_content_by_url(i.link):
            print 'content in database, continue..'
            continue
        dictData['raw_url'] = i.link
        dictData['description'] = cleanSoupHtml(i.description).get_text()
        dictData['title'] = i.title
        try:
            dictData['timestamp'] = i.published_parsed
        except:
            continue
        #check if rss from google news rss, clean url
        if google:
            url_primary = i.link.split('&url=')[1]
            url_secondary = ''
        elif newsvine:
            try:
                soup = BeautifulSoup(requests.get(i.link).text)
                url_primary = soup.find('span', {'class': 'c-seed-source'}).a['href']
                url_secondary = ''
            except:
                print 'problem opening newsvine link'
                continue
        elif fark:
            dictData['title'] = dictData['title'].split('[')[0]
            try:
                soup = BeautifulSoup(requests.get(i.id).text)
                url_temp = soup.find('a', {'class': 'outbound_link'})['href']
                url_primary = url_temp[url_temp.rfind('http'):]
                url_secondary = ''
            except:
                print 'problem opening fark link'
                continue
        else:
            try:
                url_primary = i.id
                url_secondary = i.link
            except AttributeError:
                url_primary = i.link
                url_secondary = ''
            except:
                continue
        #get the content if url is valid
        urlContentData = url_content(url_primary, url_secondary)
        if urlContentData:
            dictData = dict(dictData.items() + urlContentData.items())
        else:
            continue
        data.append(dictData)
        time.sleep(1)
    return data
def get_text(content_id):
    raw_html = Content.get_raw_html_by_id(content_id)
    try:
        text = Extractor(extractor='ArticleExtractor', html=raw_html).getText()
    except Exception as e:
        logger.exception('\nError extracting text from html. Exception: %s, %s', e.__class__.__name__, e)
        return ''
    return text
def get_anchor_feature(content_id=0, content_data='', html_data=''):
    """
    a tags in p tags
    @status: optional
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @todo:
    """
    soup_data = ''
    anchor_text = []
    anchor_link = []
    #get data
    if content_id:
        raw_html = Content.get_raw_html_by_id(content_id)
        try:
            soup_data = BeautifulSoup(raw_html)
        except:
            pass
    elif content_data:
        soup_data = content_data
    elif html_data:
        try:
            soup_data = BeautifulSoup(html_data)
        except:
            pass
    if not soup_data:
        soup_data = BeautifulSoup('')
    #get a tags links and text
    anchor_data = get_anchor(soup_data)
    for a in anchor_data:
        if a.string:
            anchor_text += a.string.split()
        if a.has_attr('href'):
            anchor_link.append(a['href'])
    #run thru text analysis even with empty soup cause of the loops (fixed bug)
    if not anchor_link:
        anchor_link = ['']
    #extract text features
    anchor_text = ' '.join(anchor_text)
    text_result = text_analysis(anchor_text, var_name='anchor')
    #extract link features
    url_list = []
    for link in anchor_link:
        url_list.append(url_analysis(link, var_name='anchor'))
    url_result = sum_list_dict(url_list, 'anchor')
    anchor_dict = dict(text_result.items() + url_result.items())
    return anchor_dict
def load_database(force_refresh=False):
    sources = ContentSource.query.all()
    for source in sources:
        url = source.url
        if 'google.com' in url:
            data = requestRssData(url, google=True, force_refresh=force_refresh)
        elif 'newsvine.com' in url:
            data = requestRssData(url, newsvine=True, force_refresh=force_refresh)
        elif 'feedsportal.com/c/35344/f' in url:
            data = requestRssData(url, fark=True, force_refresh=force_refresh)
        elif 'reddit.com' in url:
            data = rss_data(url)
        else:
            data = requestRssData(url, force_refresh=force_refresh)
        for content in data:
            content['source_id'] = source.id
            print 'Storing ' + content['url'] + ' from ' + url + ' ...'
            Content.create_or_update_content(db.session, **content)
            db.session.commit()
    print 'done loading database...'
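# A hedged sketch of how the collection step might be kicked off; the real
# project probably runs load_database / get_data_reddit from a scheduler
# (cron, celery beat, etc.) rather than a __main__ block.
if __name__ == '__main__':
    load_database(force_refresh=False)
    get_data_reddit()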
def get_body():
    """
    Extract text from the stored html of every content row.
    returns: list of [text, real_shares] pairs
    """
    result = []
    ids = [content.id for content in Content.query.all()]
    for content_id in ids:
        content_data = Content.get_content_by_id(content_id)
        result.append([html_to_text(content_data), content_data.real_shares])
    return result
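# get_body() returns [text, real_shares] pairs, which reads like training data
# for the share predictor used elsewhere (predicted_shares). A minimal sketch
# of fitting a regression model on it, assuming scikit-learn; the project's
# actual training pipeline is not shown here.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

def train_share_model():
    rows = get_body()
    texts = [row[0] for row in rows]
    shares = [row[1] or 0 for row in rows]
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    features = vectorizer.fit_transform(texts)
    model = Ridge(alpha=1.0)
    model.fit(features, shares)
    return vectorizer, model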
def get_description_feature(content_id=0, content_data=''):
    """
    status: optional
    @todo:
    """
    desc_data = ''
    #get data
    if content_id:
        desc_data = Content.get_content_by_id(content_id).description
        #use title data for analysis
        if not desc_data:
            desc_data = Content.get_content_by_id(content_id).title
    elif content_data:
        desc_data = content_data
    if not desc_data:
        desc_data = ''
    #extract feature
    desc_dict = text_analysis(desc_data, var_name='desc', head_body='head')
    return desc_dict
def populate_real_shares():
    logger.info('starting real shares population...')
    start_time = time.time()
    contents = Content.get_content_no_real_shares_by_age(3)
    for content in contents:
        try:
            total_share = get_total_shares(content.url)
        except Exception:
            logger.exception('Unable to fetch real shares for content with id %s', content.id)
            continue
        content.real_shares = total_share
        session.add(content)
    session.commit()
    logger.info('real shares population completed in %s', str(timedelta(seconds=time.time() - start_time)))
def populate_social_count():
    logger.info('starting social count population...')
    start_time = time.time()
    contents = Content.get_unranked_contents_by_age(1)
    for content in contents:
        try:
            social_share = get_social_share(content.url)
        except Exception:
            logger.exception('Unable to fetch social count for content with id: %s.', content.id)
            continue
        social_share.content_id = content.id
        content.predicted_shares = predicted_shares(social_share, content)
        session.add(social_share)
        session.add(content)
    session.commit()
    logger.info('social count population completed in %s', str(timedelta(seconds=time.time() - start_time)))
def get_icon_feature(content_id=0, content_data=''):
    """
    status: optional
    """
    data = ''
    #get data
    if content_id:
        data = Content.get_content_by_id(content_id).icon_url
    elif content_data:
        data = content_data
    #return result
    if data:
        return {'icon': 1}
    else:
        return {'icon': 0}
def get_title_feature(content_id=0, content_data=''):
    """
    status: required
    """
    title = ''
    #get data
    if content_id:
        title = Content.get_content_by_id(content_id).title
    elif content_data:
        title = content_data
    if not title:
        return {}
    #extract text feature
    title_dict = text_analysis(title, var_name='title', head_body='head')
    return title_dict
def cluster_content():
    """
    @processes:
        1.get data: new content + top n contents
        2.cluster
        3.determine parent cluster
        4.update parent cluster
        5.store results
    @todo:
        1.use better feature vectorizer (word2vec?)
        2.display the cluster
    """
    logger.info('starting clustering sequence...')
    start_time = time.time()
    contents = Content.get_content_for_clustering()
    clusters = cluster_news(contents, train=True)
    update_parent_cluster(clusters, contents, session)
    logger.info('clustering completed in %s', str(timedelta(seconds=time.time() - start_time)))
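# A sketch of one plausible ordering for the batch jobs defined in this module;
# the actual scheduling is not shown here and may run each job independently.
def run_batch_jobs():
    populate_social_count()   # fetch share counts and predict shares for new content
    rank_contents()           # rank recently collected content
    cluster_content()         # group related stories into clusters
    populate_real_shares()    # backfill actual share totals for older content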
def get_data(news_number=1000):
    """
    todo: get text from db? html?
    """
    data = []
    counter = 0
    ids = id_from_database()
    for news_id in ids:
        if counter >= news_number:
            break
        content = Content.get_content_by_id(news_id)
        title = content.title
        description = content.description
        if not description:
            description = title
        data.append([news_id, title, description])
        counter += 1
    return data
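# cluster_news() itself is not shown in this module; a minimal sketch of how the
# [id, title, description] rows from get_data() could be clustered, assuming
# scikit-learn (TF-IDF + DBSCAN). This is an illustration, not the project's
# actual implementation.
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

def cluster_rows(rows):
    texts = [title + ' ' + description for _, title, description in rows]
    vectors = TfidfVectorizer(stop_words='english').fit_transform(texts)
    labels = DBSCAN(eps=0.9, metric='cosine', min_samples=2).fit_predict(vectors)
    #map content id -> cluster label (-1 means noise / no cluster)
    return dict(zip([row[0] for row in rows], labels))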
def get_html_feature(content_id=0, content_data='', html_data=''):
    """
    status: required
    @param content_data: BeautifulSoup object
    @param html_data: html string
    @TODO: image, video, etc..
    """
    html_dict = {}
    soup = ''
    #get data
    if content_id:
        html_data = Content.get_raw_html_by_id(content_id)
        try:
            soup = BeautifulSoup(html_data)
        except:
            pass
    elif content_data:
        soup = content_data
    elif html_data:
        try:
            soup = BeautifulSoup(html_data)
        except:
            pass
    if not soup:
        return {}
    #extract features
    html_dict['html_num'] = get_html_num(soup)
    html_dict['html_h'] = get_html_tags(soup, HTML_HEAD)
    html_dict['html_a'] = get_html_tags(soup, HTML_A)
    html_dict['html_p'] = get_html_tags(soup, HTML_P)
    html_dict['html_embed'] = get_html_tags(soup, HTML_EMBED)
    html_dict['html_style'] = get_html_tags(soup, HTML_STYLE)
    html_dict['html_layout'] = get_html_tags(soup, HTML_LAYOUT)
    html_dict['html_meta'] = get_html_tags(soup, HTML_META)
    html_dict['html_input'] = get_html_tags(soup, HTML_INPUT)
    html_dict['html_script'] = get_html_tags(soup, HTML_SCRIPT)
    return html_dict
def get_timestamp_feature(content_id=0, content_data=''):
    """
    status: required
    @todo:
    """
    date_dict = {}
    time_data = ''
    #get data
    if content_id:
        time_data = Content.get_content_by_id(content_id).timestamp
    elif content_data:
        time_data = content_data
    if not time_data:
        return {}
    #extract feature
    date_dict['timestamp_day'] = day_published(time_data)
    date_dict['timestamp_hour'] = hour_published(time_data)
    return date_dict
def get_content_type_feature(content_id=0, content_data=''):
    """
    status: optional
    """
    data = 0
    #get data
    if content_id:
        data_temp = Content.get_content_by_id(content_id).type_id
        if data_temp and data_temp.isdigit():
            data = int(data_temp)
    elif content_data:
        data = content_data
    #data is either the numeric type_id or a type object with an id attribute
    if not data:
        content_type = 0
    elif isinstance(data, int):
        content_type = data
    else:
        content_type = data.id
    return {'content_type': content_type}
def is_duplicate_url(self):
    content = Content.get_content_by_link(self.url)
    if content:
        return True
    return False
def generate_content(content_data, source_id, session):
    content = Content()
    content.url = content_data.url
    content.feed_id = get_feed_id(content_data.feed)
    content.title = content_data.title
    content.description = content_data.description
    content.image_url = content_data.image_url
    content.icon_url = content_data.icon_url
    content.timestamp = content_data.timestamp
    content.content_source_id = source_id
    content.tags = get_tags(content_data, session)
    content.site_name = get_site_name(content_data.soup, session)
    content.type = content_data.type
    features = Extract(content_data)
    content.feature_extraction = features.get_feature(convert_string=True)
    return content
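# generate_content() delegates feature building to an Extract class that is not
# shown here. Below is a hypothetical aggregator over the get_*_feature helpers
# above, assuming each accepts a content_id; illustrative only.
def extract_all_features(content_id):
    features = {}
    for extractor in (get_title_feature, get_url_feature, get_description_feature,
                      get_timestamp_feature, get_heading_feature, get_anchor_feature,
                      get_html_feature, get_icon_feature, get_content_type_feature):
        features.update(extractor(content_id=content_id))
    return features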