def run(self): print("thread post article to page already started") try: db_thread = None next_article = None count = 0 #get statistic on FB, TW while self.is_running: try: next_article = self.queue.get(True) ''' sometime post bloomberg image causing error: (#100) picture URL is not properly formatted ''' if (POISON == next_article): self.stop_running() break if ( next_article.thumbnail_url is None or len(next_article.thumbnail_url) < 8):#''''bloomberg' in next_article.url or''' continue if (next_article.thumbnail_url.startswith("//")): next_article.thumbnail_url = "http:" + next_article.thumbnail_url '''put photo to fb''' result_photo = None result_photo_urls = None tk_index = -1 try: count = count + 1 tk_index= count % len(self.tokens) api = facebook.GraphAPI(self.tokens[tk_index]) result_photo = api.put_photo_url_2_page(next_article.short_description, next_article.thumbnail_url, profile_id="me", published=False) post = api.get_object(id=result_photo['id'], fields="images") result_photo_urls = str(post['images']) print("photo_Id: " + result_photo['id']) #print("result_photo_urls: " + str(result_photo_urls)) except Exception as e: #print("Error index: " + tk_index) print("Cannot post photo " + next_article.thumbnail_url + " to FB page {}".format(e)) if ((result_photo is None or result_photo['id'] is None)): print("Cannot post " + next_article.url + " to FB page") else: # save id to database '''save db post id''' try: try: db_thread = IIIDatbaseConnection() db_thread.init_database_cont() print("update fb thumnail database result: ") print(db_thread.update_article_fbid_photo(next_article.url, result_photo_urls, result_photo['id'] if result_photo is not None else None)) finally: db_thread.close_database_cont() except Exception as e: print("Error save FB id to database: {}".format(e)) except Exception as e: print("Error when post article to FB pages: {}".format(e)) sleep(0.8) print("thread post article to FB stopped") except Exception as db_e: print("Error database thread post to Facebook: {}".format(db_e))
def run(self):
    db_thread = None
    try:
        db_thread = IIIDatbaseConnection()
        db_thread.init_database_cont()
        running_time = 0
        while self.is_running:
            running_time = running_time + 1
            print("======================= Running time " + str(running_time)
                  + " ==============================")
            # compare only articles whose statistics were refreshed in the last 4 hours
            cur = db_thread.cursor()
            cur.execute("SELECT id, text, updated_time, title, category_id FROM articles "
                        "WHERE is_duplicated = 0 AND UNIX_TIMESTAMP() - last_update_statistic < "
                        + str(4 * 60 * 60))
            articles_set = cur.fetchall()
            print(len(articles_set))
            # O(n^2) pairwise comparison over the candidate set
            for article1 in articles_set:
                for article2 in articles_set:
                    try:
                        similarity_content = get_consine_text(article1[1], article2[1])
                        similarity_title = get_consine_text(normalize_text(article1[3]),
                                                            normalize_text(article2[3]))
                        # weighted blend: body similarity counts more than title similarity
                        similarity = (3 * similarity_content + 2 * similarity_title) / 5
                        if (article1[0] != article2[0]
                                and similarity > 0.56
                                and abs(article1[2] - article2[2]) < DUPLICATION_TIME_POSISIBLE
                                and article1[3] == article2[3]):
                            print(similarity)
                            print("duplicate found, marking the older article")
                            print(article1[3] + " ====================== " + article2[3])
                            # the older of the two articles is flagged as the duplicate
                            if int(article1[2]) < int(article2[2]):
                                duplicated_id = article1[0]
                            else:
                                duplicated_id = article2[0]
                            print(db_thread.update_article_duplicated(duplicated_id, True))
                    except Exception as e:
                        print("Error when computing cosine similarity and marking duplicates: {}".format(e))
            cur.close()
            print("finished: " + str(running_time))
            time.sleep(UPDATE_COUNT_PERIOD)
    except Exception as db_e:
        print("Error database duplicate: {}".format(db_e))
    finally:
        # close the database connection before exiting
        if (db_thread is not None):
            db_thread.close_database_cont()
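# NOTE: get_consine_text() and normalize_text() live elsewhere in the
# project; this is only a sketch of the bag-of-words cosine similarity
# that get_consine_text presumably computes (assumed behavior):
import math
from collections import Counter

def cosine_similarity_sketch(text_a, text_b):
    """Cosine similarity of term-frequency vectors over whitespace tokens."""
    vec_a, vec_b = Counter(text_a.split()), Counter(text_b.split())
    dot = sum(vec_a[t] * vec_b[t] for t in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(c * c for c in vec_a.values()))
    norm_b = math.sqrt(sum(c * c for c in vec_b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

# cosine_similarity_sketch("real madrid wins", "real madrid loses")  # ~0.67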
def run(self):
    db_thread = None
    try:
        db_thread = IIIDatbaseConnection()
        db_thread.init_database_cont()
        next_url = None
        while self.is_running:
            # fetch FB statistics for the next queued URL
            # (the Twitter counterpart was disabled in this version)
            try:
                next_url = self.queue.get(True)
                if (next_url is None or next_url == POISON):
                    self.stop_running()
                    break
                if (not db_thread.should_update_statisics(next_url)):
                    continue
                param_url = next_url
                if ("http://espn.go.com/" in next_url):
                    # trim the slug after the article id, e.g.
                    # ".../story/_/id/12345/some-slug" -> ".../story/_/id/12345"
                    try:
                        param_url = (next_url.split("_/id/")[0] + "_/id/"
                                     + next_url.split("_/id/")[1].split("/")[0])
                    except Exception:
                        print("cannot split espn url")
                        param_url = next_url
                fb_param = dict(method='links.getStats',
                                urls=urllib.parse.quote(param_url, safe=''),
                                format='json')
                fb_resp = requests.get(url=FB_REST_API, params=fb_param)
                data = json.loads(fb_resp.text)[0]
                print(db_thread.update_article_count(next_url,
                                                     data['comment_count'],
                                                     data['share_count'],
                                                     data['like_count'],
                                                     data['comments_fbid'],
                                                     0))
            except Exception as e:
                print("Error when getting FB like/comment counts: {}".format(e))
    except Exception as db_e:
        print("Error database thread get like_share count: {}".format(db_e))
    finally:
        # close the database connection before exiting
        if (db_thread is not None):
            db_thread.close_database_cont()
        print("commentlikeshare thread stopped")
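# A small helper (illustrative, not in the original) that pulls the ESPN
# trimming above into one reusable, testable function:
def trim_espn_url(url):
    """Keep everything up to and including the numeric article id."""
    head, sep, tail = url.partition("_/id/")
    if not sep:
        return url  # not an id-style ESPN URL
    return head + "_/id/" + tail.split("/")[0]

# trim_espn_url("http://espn.go.com/nba/story/_/id/12345/heat-beat-spurs")
# -> "http://espn.go.com/nba/story/_/id/12345"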
print('...................................................\n'
      + '...................................................\n'
      + '...................................................\n'
      + 'start getting articles from washington post\n'
      + '...................................................\n'
      + '...................................................\n')
try:
    db_connect = IIIDatbaseConnection()
    db_connect.init_database_cont()
    # process each Washington Post homepage / section page
    for home_page in washington_post_home_pages:
        print("extracting: " + home_page)
        washington_page = requests.get(home_page)
        html_tree = html.fromstring(washington_page.text)
        article_urls = html_tree.xpath('//a/@href')
        for home_url in article_urls:
            if home_url is not None and len(home_url) > 16:
                # resolve relative links against the site root
                if ('http://' not in home_url and 'https://' not in home_url):
                    home_url = WASHINGTON_POST + home_url
                try:
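# NOTE (suggestion, not in the original): string concatenation misses
# protocol-relative ("//...") and dot-relative links; urllib.parse.urljoin
# from the standard library resolves all of these against the page URL:
from urllib.parse import urljoin

def resolve_link(base, href):
    # absolute URLs pass through unchanged; relative ones are resolved
    return urljoin(base, href)

# resolve_link('https://www.washingtonpost.com/', '/world/story.html')
# -> 'https://www.washingtonpost.com/world/story.html'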
@author: Vu Trong Hoa
'''
from pymysql.err import MySQLError
#from DatabaseConnectionLib import IIIDatbaseConnection
from iiidatabase.DatabaseConnectionLib import IIIDatbaseConnection

# maps CNN section names to this project's category ids
cnn_category = {
    'sport': 'sport',
    'world': 'world',
    'tech': 'tech',
    'entertainment': 'entertainment',
    'opinions': 'opinions',
    'more': 'others'
}

try:
    db_connect = IIIDatbaseConnection()
    db_connect.init_database_cont()
    # one-off maintenance queries; earlier runs are kept commented for reference
    #print(db_connect.insert_category('category3', 'Category3'))
    #print(db_connect.insert_source('usa_today', 'USA Today', 'http://www.usatoday.com'))
    #print(db_connect.insert_source2('vnexpress', 'VNEpress', 'http://vnexpress.net',
    #                                'http://graph.facebook.com/612208245488345/picture?type=large'))
    print(db_connect.insert_source2(
        'huffington_usa',
        'The Huffington Post',
        'http://www.huffingtonpost.com/',
        'graph.facebook.com/18468761129/picture?type=large'))
    #print(db_connect.insert_article('http://this_is_first_bbc_url', 'this is funny article',
    #                                '43252', 'bbc_uk', 'entertainment', 10, 10, 10, True, False))
    #print(db_connect.update_article_like('http://this_is_first_bbc_url', 100))
    #print(db_connect.is_url_existed('http://this_is_first_bbc_urldf'))
def run(self):
    try:
        running_time = 0
        while self.is_running:
            try:
                running_time = running_time + 1
                print("======================= Running time " + str(running_time)
                      + " ==============================")
                # get the list of article URLs that need a statistics refresh
                all_urls = {}
                cur = None
                load_db_cont = None
                try:
                    load_db_cont = IIIDatbaseConnection()
                    load_db_cont.init_database_cont()
                    cur = load_db_cont.cursor()
                    # articles not refreshed in the last 25 minutes, younger than AGE_GET_UPDATE
                    query = ("SELECT id, url, updated_time, UNIX_TIMESTAMP() - updated_time FROM articles "
                             "WHERE (UNIX_TIMESTAMP() - last_time_update) > " + str(25 * 60)
                             + " AND UNIX_TIMESTAMP() - updated_time < " + str(AGE_GET_UPDATE))
                    print(query)
                    cur.execute(query)
                    all_urls = cur.fetchall()
                except Exception as e:
                    print("Cannot read database {}".format(e))
                finally:
                    try:
                        cur.close()
                        load_db_cont.close_database_cont()
                    except Exception:
                        pass
                print(len(all_urls))
                round_count = 0
                urlparams = ""
                for r in all_urls:
                    if (self.is_running):
                        try:
                            next_url = r[1]
                            # delete articles older than DELETE_PERIOD
                            if (int(time.time()) - int(r[2]) > DELETE_PERIOD):
                                delete_cont = None
                                try:
                                    delete_cont = IIIDatbaseConnection()
                                    delete_cont.init_database_cont()
                                    delete_cont.delete_article(r[0])
                                except Exception as e:
                                    print("Error when deleting old articles: {}".format(e))
                                finally:
                                    try:
                                        delete_cont.close_database_cont()
                                    except Exception:
                                        pass
                                continue
                            if (int(time.time()) - int(r[2]) > AGE_GET_UPDATE):
                                print("do not need to update this")
                                continue
                            round_count = round_count + 1
                            urlparams += urllib.parse.quote(next_url, safe='') + ","
                            # query Facebook in batches of 10 URLs
                            # (note: a final partial batch of < 10 URLs is not flushed here)
                            if (round_count == 10):
                                round_count = 0  # reset count
                                fb_param = dict(method='links.getStats', urls=urlparams, format='json')
                                fb_resp = requests.get(url=FB_REST_API, params=fb_param)
                                for data in json.loads(fb_resp.text):
                                    save_cont = None
                                    try:
                                        save_cont = IIIDatbaseConnection()
                                        save_cont.init_database_cont()
                                        print(save_cont.update_article_count(
                                            urllib.parse.unquote(data['url']),
                                            data['comment_count'],
                                            data['share_count'],
                                            data['like_count'],
                                            data['comments_fbid'],
                                            0))
                                    except Exception as e:
                                        print("Error when updating counts for article: {}".format(e))
                                    finally:
                                        try:
                                            save_cont.close_database_cont()
                                        except Exception:
                                            pass
                                urlparams = ""
                        except Exception as e:
                            urlparams = ""
                            print("Error when getting FB like/comment counts: {}".format(e))
                all_urls = None
            except Exception as db_e:
                print("Error database thread get like_share count: {}".format(db_e))
            print("finished one round, now taking a sleep")
            time.sleep(UPDATE_COUNT_PERIOD)
    except Exception as db_e:
        print("Error database thread get like_share count: {}".format(db_e))
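# The batching above flushes only full groups of 10, so a trailing partial
# batch is silently dropped. A minimal sketch (names assumed) of a chunking
# helper that sends every group, including the last partial one:
def batches(items, batch_size=10):
    """Yield consecutive slices of at most batch_size elements."""
    for start in range(0, len(items), batch_size):
        yield items[start:start + batch_size]

# usage sketch:
# for group in batches(urls, 10):
#     send_stats_request(group)   # hypothetical sender for one batch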
def run(self): print("thread post article to page already started") try: db_thread = IIIDatbaseConnection() #get statistic on FB, TW while self.is_running: try: '''if (int(time.time()) - self.LAST_TIME_GET_TOKEN > TOKEN_TIMEOUT): with open ("../../fb_token.txt", "r") as myfile: self.FB_LONGLIVE_ACTK =myfile.read().replace('\n', '') self.LAST_TIME_GET_TOKEN = int(time.time()) print("refreshed Facebook token")''' for source_id in sources_page_id: print("=============== sourceid: " + source_id + " ===============") api = get_page_api(TOKEN, '1661911627411850') print("get post on each source") posts = api.get_object(id=source_id + '/posts') #print(posts['data']) for post in posts['data']: try: string_post_content = '' if ('story' in post): string_post_content = string_post_content + post[ 'story'] + " " if ('link' in post): string_post_content = string_post_content + " " + post[ 'link'] + " " print(post['link']) if ('message' in post): string_post_content = string_post_content + post[ 'message'] print(post['message']) if (string_post_content is not None): #print("post content: " + string_post_content) urls = re.findall(r'(https?://\S+)', string_post_content) if (urls is None or len(urls) <= 0): print("not found url") else: # save id to database for url in urls: try: db_thread.init_database_cont() url = utils.normalize_url(url) print("postId: " + post['id'] + " : " + url) print( db_thread.update_article_fbid( url, post['id'])) db_thread.close_database_cont() except Exception as e: print( "Error when save facebook post id to db: {}" .format(e)) except Exception as e: print( "Error when get url from facebook post: {}" .format(e)) except Exception as e: print( "Error when get url from facebook post: {}".format(e)) time.sleep(2 * UPDATE_COUNT_PERIOD) except Exception as db_e: print("Error database thread post to Facebook: {}".format(db_e)) finally: #close connection db before exit if (db_thread is not None): db_thread.close_database_cont()
            except Exception as dbE:
                print("Error when inserting article to db. {}".format(dbE))
            # after inserting into the database, queue this url for
            # share/comment/like statistics
            url_sharelikecomment_queue.put(normalized_url, True)

print('...................................................\n'
      + '...................................................\n'
      + '...................................................\n'
      + 'start getting articles from vnexpress\n'
      + '...................................................\n'
      + '...................................................\n')
try:
    db_connect = IIIDatbaseConnection()
    db_connect.init_database_cont()
    # ''' we process homepage news ''' (disabled)
    # VNEXPRESS_HOMPAGE_NEWS = 'http://vnexpress.net/tin-tuc/thoi-su/'
    # vnexpress_homepage_news = requests.get(VNEXPRESS_HOMPAGE_NEWS)
    # html_tree = html.fromstring(vnexpress_homepage_news.text)
    # article_urls = html_tree.xpath('//a/@href')
    # for home_url in article_urls:
    #     if home_url is not None and len(home_url) > 16:
    #         if ('http://' not in home_url and 'https://' not in home_url):
    #             home_url = VNEXPRESS_HOME + home_url
    #         try: