def configSQL(self): connector = MySQLConnector(LOGIN_INFO_PATH) self.sqlConn = connector.connect() # create database if not exists if not DATABASE_NAME in self.sqlConn.getDatabaseList(): self.sqlConn.createDatabase(DATABASE_NAME) db = self.sqlConn.getDatabase(DATABASE_NAME) # create tables if not exists if not SYMBOL_DETAIL_TABLE_NAME in db.getTableList(): db.createTable(SYMBOL_DETAIL_TABLE_NAME, SYMBOL_DETAIL_TABLE) db.commit() self.sqlConn.close()
def updateHistData(symbols: list): connector = MySQLConnector(LOGIN_INFO_PATH) sqlConn = connector.connect() saveCount = 0 for i in range(len(symbols)): print("updating... {}/{}".format(i + 1, len(symbols))) symbol = symbols[i] yf = yahoofinance.YFData(symbol) updateHistPrice(sqlConn, yf, symbol) updateDividend(sqlConn, yf, symbol) updateStocksplit(sqlConn, yf, symbol) checkAdjclose(sqlConn, yf, symbol) saveCount += 1 if saveCount == 100: sqlConn.commit() saveCount = 0 sqlConn.commit() sqlConn.close()
def getSymbolList() -> list: connector = MySQLConnector(LOGIN_INFO_PATH) sqlConn = connector.connect() sql = """ SELECT `symbols`.`symbols`.`symbol`, `symbols`.`symbols`.`enable`, `symbols`.`symbol_details`.`marketCap` FROM `symbols`.`symbols` INNER JOIN `symbols`.`symbol_details` ON `symbols`.`symbols`.`symbol`=`symbols`.`symbol_details`.`symbol` WHERE `symbols`.`symbols`.`enable` = 1 and `symbols`.`symbol_details`.`marketCap` > 0; """ data = sqlConn.execute(sql) symbols = data["symbol"] sql = """ SELECT `symbol` FROM `symbols`.`symbol_details` WHERE `quoteType` = "INDEX"; """ data = sqlConn.execute(sql) symbols.extend(data["symbol"]) sqlConn.close() return symbols
class Analyzer: def __init__(self, date): self.date = date self.connector = MySQLConnector() self.logger = logging.getLogger('analyzer') self.logger.setLevel(level=logging.DEBUG) ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') ch.setFormatter(formatter) self.logger.addHandler(ch) self.logger.info("Finished initialization.") def get_sentiment_score(self, title, text): sent_analyzer = SentimentIntensityAnalyzer() title_score = sent_analyzer.polarity_scores(title)['compound'] title_score = int(round((title_score + 1) * 10)) sents = sent_tokenize(text) scores = [] for sent in sents: if (len(word_tokenize(sent)) > 1): # 句子中单词大于1个 score = sent_analyzer.polarity_scores(sent)['compound'] scores.append(score) text_score = np.mean(scores) text_score = int(round((text_score + 1) * 40)) final_score = title_score + text_score return final_score def preprocess_text(self, text): text = text.replace("'", "''") return text def get_summary(self, title, text): article = Article(url='') article.title = title article.text = text article.download_state = ArticleDownloadState.SUCCESS article.is_parsed = True article.nlp() return self.preprocess_text(article.summary) def analyze(self): # load articles cursor = self.connector.connect() sql = "SELECT articleIndex, title, text FROM news.article WHERE downloadDate=%s and groupIndex is not null" % self.date # sql = "SELECT articleIndex, title, text FROM news.article WHERE groupIndex is not null" try: cursor.execute(sql) self.articles = cursor.fetchall() # print(self.articles) except: self.logger.info("Error: unable to fecth data") for article in self.articles: # generate summary summary = self.get_summary(article[1], article[2]) # sentiment analysis sentiment_score = self.get_sentiment_score(article[1], article[2]) # print(article[0] + '\t' + article[1] +'\t'+ str(sentiment_score)) # upload to DB sql = "UPDATE news.article SET sentimentScore='%d',summary='%s' WHERE articleIndex = '%s'" % ( sentiment_score, summary, article[0]) #sql = "UPDATE news.article SET sentimentScore='%d' WHERE articleIndex = '%s'" % (sentiment_score, article[0]) try: cursor.execute(sql) self.connector.db.commit() self.logger.info( "Successfully upload article %s with score=%d", article[0], sentiment_score) except: self.logger.info("Unable to update summary and score!") self.connector.db.rollback() self.connector.disconnect() # analyzer = Analyzer(date='20200801') # analyzer.analyze()
class Crawler: def __init__(self, last_date=None, this_date=None): self.connector = MySQLConnector() self.logger = logging.getLogger('crawler') self.logger.setLevel(level=logging.DEBUG) ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) # 输出到console的log等级的开关 formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') ch.setFormatter(formatter) self.logger.addHandler(ch) # load media list if last_date == None: last_date = self.get_last_date() self.last_date = datetime.datetime.strptime(last_date, '%Y%m%d').replace(tzinfo=pytz.utc) if this_date != None: self.today = this_date else: self.today = time.strftime("%Y%m%d", time.localtime()) self.logger.info("Finished initialization.") def if_covid19_related(self, text): ''' 判断文章是否包含疫情关键词 ''' text_lower = text.lower() for kw in covid_19_keywords: if kw in text_lower: return True return False def if_url_satisfied(self, media, url): flag = False for must in media['Must']: if must in url: flag = True if not flag: return False if media['MustNot'] != None: for mustnot in media['MustNot']: if mustnot in url: return False return True def preprocess_text(self, text): text = text.replace("'", "''") return text def crawl(self): ''' 爬新的文章(上次抓取时间之后发布的) ''' self.cursor = self.connector.connect() cur_count = self.get_cur_count() for media in media_list: urls = media['URL'] for idx, url in enumerate(urls): self.logger.info("Start crawl %s - %d ..." % (media['Name'], idx)) media_website = newspaper.build(url,language='en', memoize_articles=True) sum = len(media_website.articles) self.logger.info("Found %d articles" % sum ) for i, article in enumerate(media_website.articles): print(i, article.url) if not self.if_url_satisfied(media, article.url): # 网址前缀必须符合给定分类网址 continue # print(article.url) try: article.download() print("...download") article.parse() print("...parsed") except: continue # 文章过老 if article.publish_date == None: article.publish_date = datetime.datetime.now() article.publish_date = article.publish_date.replace(tzinfo=pytz.utc) if article.publish_date < self.last_date: continue # 文章与疫情无关 if not (self.if_covid19_related(article.title) or self.if_covid19_related(article.text)): continue # 上传到MySQL数据库 # article.nlp() self.upload_article(cur_count, media['Index'], article) self.logger.info("Successfuly download %d / %d th article %s \n %s" % (i, sum, article.url, article.title)) cur_count += 1 self.connector.disconnect() def upload_article(self, count, media_index, article): #self.cursor = self.connector.connect() idArticle = '%s%05d' % (self.today, count) title = self.preprocess_text(article.title) url = self.preprocess_text(article.url) # keywords = str(article.keywords).replace("'", '"') # summary = self.preprocess_text(article.summary) text = self.preprocess_text(article.text) sql = """INSERT INTO `news`.`article` (`articleIndex`, `title`, `mediaIndex`, `publishDate`, `downloadDate`, `image`,`url`, `text`) VALUES ('%s', '%s', '%d', '%s','%s','%s', '%s', '%s'); """ % ( idArticle, title, media_index, article.publish_date.strftime("%Y-%m-%d %H:%M:%S"), self.today, article.top_image, url, text) try: self.cursor.execute(sql) self.connector.db.commit() except MySQLdb._exceptions.OperationalError: self.connector.db.rollback() self.logger.info("Unable to upload article to DB!") # self.connector.disconnect() def get_cur_count(self): # self.cursor = self.connector.connect() sql = '''select MAX(articleIndex) from news.article where articleIndex REGEXP '%s';''' % (self.today) try: self.cursor.execute(sql) max_idx = self.cursor.fetchall() except: self.logger.info("Unable to get current count!") #print(max_idx) #self.connector.disconnect() if max_idx[0][0] is None: count = 0 else: count = int(max_idx[0][0][-5:]) + 1 self.logger.info("Successfully get current count = %d ." % count) return count def get_last_date(self): self.cursor = self.connector.connect() sql = '''SELECT max(downloadDate) FROM news.article''' try: self.cursor.execute(sql) fetch_result = self.cursor.fetchall() except: self.logger.info("Unable to get last date!") self.connector.disconnect() if fetch_result[0][0] is None: last_date = "20200101" else: last_date = datetime.date.strftime(fetch_result[0][0],'%Y%m%d') print(last_date) self.logger.info("Successfully get lastdate '%s'." % last_date) return last_date # crawler = Crawler(last_date = '20200708', this_date='20200709') # crawler = Crawler() #crawler.crawl()
class Cluster(): def __init__(self, date): self.date = date self.connector = MySQLConnector() self.logger = logging.getLogger('cluster') self.logger.setLevel(level=logging.DEBUG) ch = logging.StreamHandler() ch.setLevel(logging.DEBUG) # 输出到console的log等级的开关 formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') ch.setFormatter(formatter) self.logger.addHandler(ch) self.logger.info("Finished initialization.") def remove_punctuation(self, text): text = re.sub(r'[{}]+'.format(punctuation), '', text) return text.strip().lower() def load_articles(self): '''从数据库中获取未分类的数据''' cursor = self.connector.connect() sql = "SELECT articleIndex, title, text, mediaIndex FROM news.article WHERE downloadDate='%s'" % ( self.date) # sql = '''SELECT articleIndex, title, text, mediaIndex FROM news.article where downloadDate in ('20200728', '20200729', '20200731', '20200801')''' try: cursor.execute(sql) self.articles = cursor.fetchall() self.logger.info("Successfully loaded %d articles." % len(self.articles)) except: self.logger.info("Error: unable to fecth data") self.connector.disconnect() def remove_useless_articles(self): self.load_articles() useless_articles = [] media_articles = {} for article in self.articles: media = article[3] if media not in media_articles: media_articles[media] = set() title = article[1] if title in media_articles[media]: useless_articles.append(article[0]) #去重 self.logger.info("Found repeated article %s" % (article[0])) continue else: media_articles[media].add(title) media_articles[article[3]].add(article[1]) if detect(article[1]) != 'en': # print(article[1]) useless_articles.append(article[0]) self.logger.info("Found article %s not in English" % (article[0])) else: list_words = word_tokenize(self.remove_punctuation(article[2])) if len(list_words) < 30: useless_articles.append(article[0]) self.logger.info("Found article %s too short" % (article[0])) # print(useless_articles) cursor = self.connector.connect() for articleID in useless_articles: sql = "DELETE FROM `news`.`article` WHERE(`articleIndex` = %s)" % ( articleID) try: cursor.execute(sql) self.connector.db.commit() except: self.connector.db.rollback() self.logger.info("Unable to delete useless articles") self.connector.disconnect() self.logger.info("Successfully removed %d articles" % len(useless_articles)) def encode_text(self, encode_obj='title', vector='tf-idf'): ''' 多种方式编码文本 ''' if vector == 'count': vectorizer = CountVectorizer(stop_words='english', lowercase=True, binary=True, tokenizer=myTokenizer) elif vector == 'tf-idf': vectorizer = TfidfVectorizer(stop_words='english', lowercase=True, tokenizer=myTokenizer) corpus = [] if encode_obj == 'title': # 标题 idx = 1 # elif encode_obj == 'summary': # 摘要 # idx = 4 elif encode_obj == 'text': # 全文 idx = 2 for article in self.articles: corpus.append(article[idx]) X = vectorizer.fit_transform(corpus) #print(len(vectorizer.get_feature_names())) # print (vectorizer.get_feature_names()) return X def cluster(self): X = self.encode_text(encode_obj='text', vector='tf-idf') db_model = DBSCAN(eps=0.4, min_samples=3, metric='cosine').fit(X) joblib.dump(db_model, './models/model_%s.pkl' % self.date) self.group_list = db_model.labels_ # 打印分组结果 self.logger.info("Cluster results: %d groups " % np.max(self.group_list)) # for i in range(len(self.group_list)): # if self.group_list[i] != -1: # print(str(self.group_list[i])+'\t'+str(self.articles[i][3])+'\t'+self.articles[i][1]) group_result = {} for i, label in enumerate(self.group_list): if label not in group_result: group_result[label] = [] group_result[label].append(self.articles[i][1]) for i in range(0, np.max(self.group_list) + 1): print(i) print(group_result[i]) def upload_groups_to_DB(self): cursor = self.connector.connect() for i, label in enumerate(self.group_list): if label == -1: continue sql = "UPDATE news.article SET groupIndex='%s%03d' WHERE articleIndex = '%s'" % ( self.date, label, self.articles[i][0]) #print(sql) try: cursor.execute(sql) self.connector.db.commit() except: # 发生错误时回滚 self.logger.info("Unable to update group index!") self.connector.db.rollback() self.connector.disconnect() self.logger.info("Successfully update group indices.")
class Uploader: def __init__(self, date): self.date = date self.articles = [] self.connector = MySQLConnector() leancloud.init("U83hlMObhFRFRS4kX3lOxSlq-gzGzoHsz", "Jw2Y6KFFsjI5kEz1qYqQ62da") logging.basicConfig(level=logging.DEBUG) def upload_to_old_groups(self): '''如果新下载的新闻被分类到旧的新闻组,直接上传''' pass def upload_new_groups(self): '''上传新的新闻组''' cursor = self.connector.connect() sql = "SELECT articleIndex,groupIndex,title,publishDate,image,url,mediaObjId,sentimentScore,summary \ FROM news.article INNER JOIN news.media ON article.mediaIndex = media.index \ where groupIndex is REGEXP '%s'" % self.date try: cursor.execute(sql) articles = cursor.fetchall() # print(self.articles) except: print("Error: unable to fecth data") self.connector.disconnect() self.groups = {} for article in articles: groupIndex = article[1] if groupIndex not in self.groups: self.groups[groupIndex] = [] self.groups[groupIndex].append(article) for groupIndex in self.groups: # upload articles article_obj_ids = [] group_img = None for article in self.groups[groupIndex]: Article = leancloud.Object.extend('Article') Media = leancloud.Object.extend('Media') article_obj = Article() article_obj.set('ArticleIndex', article[0]) article_obj.set('GroupIndex', article[1]) article_obj.set('Title', article[2]) article_obj.set('Date', article[3]) if article[4] != '' : article_obj.set('ImageURL', article[4]) group_img = article[4] article_obj.set('Link', article[5]) article_obj.set('Media', Media.create_without_data(article[6])) # Pointer article_obj.set('SentimentScore', article[7]) article_obj.set('Summary', article[8]) article_obj.save() article_obj_ids.append(article_obj.id) # upload group info rank_score = self.cal_group_rank_score(self.groups[groupIndex]) # print(groupIndex + '\t' + str(rank_score)) # print(groupIndex, rank_score) NewsGroup = leancloud.Object.extend('NewsGroup') group_obj = NewsGroup() group_obj.set('Title', self.groups[groupIndex][0][2]) # 第一篇文章的title作为group title group_obj.set('GroupIndex', groupIndex) group_obj.set('RankScore', rank_score) if group_img is not None: group_obj.set('ImageURL', group_img) group_obj.set('Articles',(article_obj_ids)) group_obj.save() def cal_group_rank_score(self, articles): article_num = len(articles) if article_num == 3: num_score = 2 elif 4 <= article_num <= 8: num_score = 3 else: num_score = 1 senti_scores = [] media_set = set() for article in articles: senti_scores.append(article[7]) media_set.add(article[6]) media_num = len(media_set) if media_num == 1: media_score = 0 elif media_num == 2: media_score = 1 elif media_num <= 4: media_score = 2 else: media_score = 3 senti_diff = int(np.max(senti_scores) - np.min(senti_scores)) return num_score + media_score + senti_diff