def connect_to_mysql(self):
    conn = MySQLConnector()
    conn.open(self.mysql_connection['host'],
              self.mysql_connection['name'],
              self.mysql_connection['user'],
              self.mysql_connection['pass'])
    return conn
Example #3
def OutterChecker(predictGroups, SD, ED, HRZID, Code, downGasAmount,
                  upGasAmount):
    # build a suitable time index and DataFrame from the start (SD) and end (ED) dates
    validIndex = pd.date_range(SD, ED, freq='1min')
    validDataFrame = pd.DataFrame(index=validIndex)
    # create the database connection
    conn = MySQLConnector()
    conn.openConnector()
    # query the gas-flow series within this control cycle
    SQL_Gas = u'SELECT create_time,瞬时流量 FROM bd_xinkou_2 WHERE create_time>"%s" and create_time<"%s"' % (
        SD, ED)
    conn.cursor.execute(SQL_Gas)
    realGasSeries = np.array(conn.cursor.fetchall())
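    # no flow records in this window: return score 1 and random placeholder series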
    if len(realGasSeries) == 0:
        return 1, [1], np.random.randn(len(validIndex)), np.random.randn(
            len(validIndex))
    realGasDataFrame = pd.DataFrame(np.array(realGasSeries[:, 1],
                                             dtype=np.float32),
                                    index=realGasSeries[:, 0],
                                    columns=[u'瞬时流量'])
    realGasDataFrame = realGasDataFrame.resample('1min').mean()
    validDataFrame[u'瞬时流量'] = realGasDataFrame[u'瞬时流量']
    realGasMean = realGasSeries[:, 1].mean()
    # query the heat-exchange station's primary return temperature
    SQL_HFBT = u'SELECT create_time,一次回温度 FROM bd_xinkou_hrz WHERE project_sub_station_id=%d and code="%s" and create_time>"%s" and create_time<"%s"' % (
        HRZID, Code, SD, ED)
    conn.cursor.execute(SQL_HFBT)
    realHFBTArray = np.array(conn.cursor.fetchall())
    if len(realHFBTArray) == 0:
        return 1, [1], np.random.randn(len(validIndex)), np.random.randn(
            len(validIndex))
    realHFBTSeries = pd.Series(np.array(realHFBTArray[:, 1], dtype=np.float32),
                               index=realHFBTArray[:, 0],
                               name=u'一次回温度')
    realHFBTSeries = realHFBTSeries.resample('1min').mean()
    validDataFrame[u'一次回温度'] = realHFBTSeries
    # pick the proper index into the prediction groups
    try:
        assert realGasMean > downGasAmount
        assert realGasMean < upGasAmount
        predictGroupsIndex = int(realGasMean // 100 - downGasAmount // 100)  # cast to int for list indexing
    except AssertionError:
        if realGasMean <= downGasAmount:
            predictGroupsIndex = 0
        else:
            predictGroupsIndex = -1
    targetPredict = predictGroups[predictGroupsIndex]
    validDataFrame[u'预测回水温度'] = targetPredict
    validDataFrame = validDataFrame.fillna(method='ffill').fillna(
        method='bfill')
    score = evalueHFBT(validDataFrame[u'瞬时流量'].as_matrix().ravel(),
                       validDataFrame[u'一次回温度'].as_matrix().ravel(),
                       validDataFrame[u'预测回水温度'].as_matrix().ravel())
    GasDis = [1]
    print u'Actual gas consumed during this control cycle: %.1f' % realGasMean
    return score, GasDis, validDataFrame[u'一次回温度'].as_matrix().ravel(
    ), validDataFrame[u'预测回水温度'].as_matrix().ravel()
Example #4
def configSQL(self):
    connector = MySQLConnector(LOGIN_INFO_PATH)
    self.sqlConn = connector.connect()
    # create database if not exists
    if DATABASE_NAME not in self.sqlConn.getDatabaseList():
        self.sqlConn.createDatabase(DATABASE_NAME)
    db = self.sqlConn.getDatabase(DATABASE_NAME)
    # create tables if not exists
    if SYMBOL_DETAIL_TABLE_NAME not in db.getTableList():
        db.createTable(SYMBOL_DETAIL_TABLE_NAME, SYMBOL_DETAIL_TABLE)
    db.commit()
    self.sqlConn.close()
Example #5
    def __init__(self, date):
        self.date = date
        self.connector = MySQLConnector()

        self.logger = logging.getLogger('analyzer')
        self.logger.setLevel(level=logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)
        self.logger.info("Finished initialization.")
Example #6
def updateHistData(symbols: list):
    connector = MySQLConnector(LOGIN_INFO_PATH)
    sqlConn = connector.connect()
    saveCount = 0
    for i in range(len(symbols)):
        print("updating... {}/{}".format(i + 1, len(symbols)))
        symbol = symbols[i]
        yf = yahoofinance.YFData(symbol)
        updateHistPrice(sqlConn, yf, symbol)
        updateDividend(sqlConn, yf, symbol)
        updateStocksplit(sqlConn, yf, symbol)
        checkAdjclose(sqlConn, yf, symbol)
        saveCount += 1
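        # commit in batches of 100 symbols to keep transactions small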
        if saveCount == 100:
            sqlConn.commit()
            saveCount = 0
    sqlConn.commit()
    sqlConn.close()
Example #7
def getSymbolList() -> list:
    connector = MySQLConnector(LOGIN_INFO_PATH)
    sqlConn = connector.connect()
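    # enabled symbols with a positive market cap, joined with their details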
    sql = """
    SELECT `symbols`.`symbols`.`symbol`, `symbols`.`symbols`.`enable`, `symbols`.`symbol_details`.`marketCap`
    FROM `symbols`.`symbols`
    INNER JOIN `symbols`.`symbol_details` ON `symbols`.`symbols`.`symbol`=`symbols`.`symbol_details`.`symbol`
    WHERE `symbols`.`symbols`.`enable` = 1 and `symbols`.`symbol_details`.`marketCap` > 0;
    """
    data = sqlConn.execute(sql)
    symbols = data["symbol"]
    sql = """
    SELECT `symbol` FROM `symbols`.`symbol_details`
    WHERE `quoteType` = "INDEX";
    """
    data = sqlConn.execute(sql)
    symbols.extend(data["symbol"])
    sqlConn.close()
    return symbols
Example #8
    def __init__(self, last_date=None, this_date=None):
        self.connector = MySQLConnector()
        self.logger = logging.getLogger('crawler')
        self.logger.setLevel(level=logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)  # log-level threshold for console output
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)

        # load media list
        if last_date is None:
            last_date = self.get_last_date()
        self.last_date = datetime.datetime.strptime(last_date, '%Y%m%d').replace(tzinfo=pytz.utc)

        if this_date is not None:
            self.today = this_date
        else:
            self.today = time.strftime("%Y%m%d", time.localtime())

        self.logger.info("Finished initialization.")
Example #9
def __init__(self, date):
    self.date = datetime.datetime.strptime(date, '%Y%m%d')
    self.connector = MySQLConnector()
    leancloud.init("U83hlMObhFRFRS4kX3lOxSlq-gzGzoHsz",
                   "Jw2Y6KFFsjI5kEz1qYqQ62da")
    logging.basicConfig(level=logging.DEBUG)
    self.logger = logging.getLogger('analyzer')
    self.logger.setLevel(level=logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    self.logger.addHandler(ch)
    self.logger.info("Finished initialization.")
Example #10
def save_to_trips(registers):
    """
    Saves a list of registers to local>taxi>trips

    Input:
        registers: list of suitable dictionaries

    Output:
        None
    """
    # Create MySQL connection
    cnx = MySQLConnector()

    # Insert registers into trips table
    for reg in registers:
        cnx.insert(reg)
    # Close connection
    cnx.close()
Example #11
class Analyzer:
    def __init__(self, date):
        self.date = date
        self.connector = MySQLConnector()

        self.logger = logging.getLogger('analyzer')
        self.logger.setLevel(level=logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)
        self.logger.info("Finished initialization.")

    def get_sentiment_score(self, title, text):

        sent_analyzer = SentimentIntensityAnalyzer()
        title_score = sent_analyzer.polarity_scores(title)['compound']
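        # compound score lies in [-1, 1]; the title contributes 0-20 points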
        title_score = int(round((title_score + 1) * 10))
        sents = sent_tokenize(text)
        scores = []
        for sent in sents:
            if len(word_tokenize(sent)) > 1:  # sentence has more than one word
                score = sent_analyzer.polarity_scores(sent)['compound']
                scores.append(score)
        text_score = np.mean(scores)
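        # scale the mean sentence score to 0-80 points, for a 0-100 total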
        text_score = int(round((text_score + 1) * 40))
        final_score = title_score + text_score
        return final_score

    def preprocess_text(self, text):
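        # double up single quotes so the text embeds safely in a SQL string literal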
        text = text.replace("'", "''")
        return text

    def get_summary(self, title, text):
        article = Article(url='')
        article.title = title
        article.text = text
        article.download_state = ArticleDownloadState.SUCCESS
        article.is_parsed = True
        article.nlp()
        return self.preprocess_text(article.summary)

    def analyze(self):
        # load articles
        cursor = self.connector.connect()
        sql = "SELECT articleIndex, title, text FROM news.article WHERE downloadDate=%s and groupIndex is not null" % self.date
        # sql = "SELECT articleIndex, title, text FROM news.article WHERE groupIndex is not null"
        try:
            cursor.execute(sql)
            self.articles = cursor.fetchall()
            # print(self.articles)
        except:
            self.logger.info("Error: unable to fecth data")

        for article in self.articles:
            # generate summary
            summary = self.get_summary(article[1], article[2])
            # sentiment analysis
            sentiment_score = self.get_sentiment_score(article[1], article[2])
            # print(article[0] + '\t' + article[1] +'\t'+ str(sentiment_score))
            # upload to DB
            sql = "UPDATE news.article SET sentimentScore='%d',summary='%s' WHERE articleIndex = '%s'" % (
                sentiment_score, summary, article[0])
            #sql = "UPDATE news.article SET sentimentScore='%d' WHERE articleIndex = '%s'" % (sentiment_score, article[0])

            try:
                cursor.execute(sql)
                self.connector.db.commit()
                self.logger.info(
                    "Successfully upload article %s with score=%d", article[0],
                    sentiment_score)
            except:
                self.logger.info("Unable to update summary and score!")
                self.connector.db.rollback()
        self.connector.disconnect()


# analyzer = Analyzer(date='20200801')
# analyzer.analyze()
Example #12
class Crawler:
    def __init__(self, last_date=None, this_date=None):
        self.connector = MySQLConnector()
        self.logger = logging.getLogger('crawler')
        self.logger.setLevel(level=logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)  # log-level threshold for console output
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)

        # load media list
        if last_date is None:
            last_date = self.get_last_date()
        self.last_date = datetime.datetime.strptime(last_date, '%Y%m%d').replace(tzinfo=pytz.utc)

        if this_date is not None:
            self.today = this_date
        else:
            self.today = time.strftime("%Y%m%d", time.localtime())

        self.logger.info("Finished initialization.")



    def if_covid19_related(self, text):
        '''
        Check whether the text contains any COVID-19 keywords.
        '''
        text_lower = text.lower()
        for kw in covid_19_keywords:
            if kw in text_lower:
                return True
        return False

    def if_url_satisfied(self, media, url):
        flag = False
        for must in media['Must']:
            if must in url:
                flag = True
        if not flag:
            return False

        if media['MustNot'] is not None:
            for mustnot in media['MustNot']:
                if mustnot in url:
                    return False
        return True

    def preprocess_text(self, text):
        text = text.replace("'", "''")
        return text

    def crawl(self):
        '''
        Crawl new articles (those published after the last crawl time).
        '''
        self.cursor = self.connector.connect()
        cur_count = self.get_cur_count()
        for media in media_list:
            urls = media['URL']
            for idx, url in enumerate(urls):
                self.logger.info("Start crawl %s - %d  ..." % (media['Name'], idx))
                media_website = newspaper.build(url, language='en', memoize_articles=True)
                total = len(media_website.articles)
                self.logger.info("Found %d articles" % total)
                for i, article in enumerate(media_website.articles):
                    print(i, article.url)
                    if not self.if_url_satisfied(media, article.url):   # the URL prefix must match one of the given section URLs
                        continue
                    # print(article.url)
                    try:
                        article.download()
                        print("...download")
                        article.parse()
                        print("...parsed")
                    except:
                        continue

                    # skip articles that are too old
                    if article.publish_date is None:
                        article.publish_date = datetime.datetime.now()

                    article.publish_date = article.publish_date.replace(tzinfo=pytz.utc)
                    if article.publish_date < self.last_date:
                        continue

                    # skip articles unrelated to COVID-19
                    if not (self.if_covid19_related(article.title) or self.if_covid19_related(article.text)):
                        continue

                    # upload to the MySQL database
                    # article.nlp()

                    self.upload_article(cur_count, media['Index'], article)
                    self.logger.info("Successfuly download %d / %d th article %s \n %s" % (i, sum, article.url, article.title))
                    cur_count += 1

        self.connector.disconnect()

    def upload_article(self, count, media_index, article):
        #self.cursor = self.connector.connect()
        idArticle = '%s%05d' % (self.today, count)
        title = self.preprocess_text(article.title)
        url = self.preprocess_text(article.url)
        # keywords = str(article.keywords).replace("'", '"')
        # summary = self.preprocess_text(article.summary)
        text = self.preprocess_text(article.text)
        sql = """INSERT INTO `news`.`article` (`articleIndex`, `title`, `mediaIndex`, `publishDate`, `downloadDate`, `image`,`url`, `text`) 
                VALUES ('%s', '%s', '%d', '%s','%s','%s', '%s', '%s');
                """ % (
        idArticle, title, media_index, article.publish_date.strftime("%Y-%m-%d %H:%M:%S"), self.today, article.top_image,  url, text)
        try:
            self.cursor.execute(sql)
            self.connector.db.commit()

        except MySQLdb._exceptions.OperationalError:
            self.connector.db.rollback()
            self.logger.info("Unable to upload article to DB!")
        # self.connector.disconnect()


    def get_cur_count(self):
        # self.cursor = self.connector.connect()
        sql = '''select MAX(articleIndex) 
                from news.article 
              where articleIndex REGEXP '%s';''' % (self.today)
        try:
            self.cursor.execute(sql)
            max_idx = self.cursor.fetchall()
        except:
            self.logger.info("Unable to get current count!")
        #print(max_idx)
        #self.connector.disconnect()
        if max_idx[0][0] is None:
            count = 0
        else:
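            # articleIndex format is <YYYYMMDD><5-digit counter>; recover the counter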
            count = int(max_idx[0][0][-5:]) + 1
        self.logger.info("Successfully get current count = %d ." % count)
        return count

    def get_last_date(self):
        self.cursor = self.connector.connect()
        sql = '''SELECT max(downloadDate) FROM news.article'''
        try:
            self.cursor.execute(sql)
            fetch_result = self.cursor.fetchall()
        except:
            self.logger.info("Unable to get last date!")
        self.connector.disconnect()
        if fetch_result[0][0] is None:
            last_date = "20200101"
        else:
            last_date = datetime.date.strftime(fetch_result[0][0],'%Y%m%d')
        print(last_date)
        self.logger.info("Successfully get lastdate '%s'." % last_date)
        return last_date

# crawler = Crawler(last_date = '20200708', this_date='20200709')
# crawler = Crawler()
#crawler.crawl()
Example #13
class Uploader:
    def __init__(self, date):
        self.date = date
        self.articles = []
        self.connector = MySQLConnector()
        leancloud.init("U83hlMObhFRFRS4kX3lOxSlq-gzGzoHsz", "Jw2Y6KFFsjI5kEz1qYqQ62da")
        logging.basicConfig(level=logging.DEBUG)

    def upload_to_old_groups(self):
        '''If newly downloaded articles are assigned to existing news groups, upload them directly.'''
        pass

    def upload_new_groups(self):
        '''Upload new news groups.'''
        cursor = self.connector.connect()
        sql = "SELECT articleIndex,groupIndex,title,publishDate,image,url,mediaObjId,sentimentScore,summary \
                FROM news.article INNER JOIN news.media ON article.mediaIndex = media.index     \
                where groupIndex is REGEXP '%s'"  % self.date
        try:
            cursor.execute(sql)
            articles = cursor.fetchall()
            # print(self.articles)
        except:
            print("Error: unable to fecth data")
        self.connector.disconnect()

        self.groups = {}
        for article in articles:
            groupIndex = article[1]
            if groupIndex not in self.groups:
                self.groups[groupIndex] = []
            self.groups[groupIndex].append(article)

        for groupIndex in self.groups:
            # upload articles
            article_obj_ids = []
            group_img = None
            for article in self.groups[groupIndex]:
                Article = leancloud.Object.extend('Article')
                Media = leancloud.Object.extend('Media')
                article_obj = Article()
                article_obj.set('ArticleIndex', article[0])
                article_obj.set('GroupIndex', article[1])
                article_obj.set('Title', article[2])
                article_obj.set('Date', article[3])
                if article[4] != '':
                    article_obj.set('ImageURL', article[4])
                    group_img = article[4]
                article_obj.set('Link', article[5])
                article_obj.set('Media', Media.create_without_data(article[6])) # Pointer
                article_obj.set('SentimentScore', article[7])
                article_obj.set('Summary', article[8])
                article_obj.save()
                article_obj_ids.append(article_obj.id)
            # upload group info
            rank_score = self.cal_group_rank_score(self.groups[groupIndex])
            # print(groupIndex + '\t' + str(rank_score))
            # print(groupIndex, rank_score)
            NewsGroup = leancloud.Object.extend('NewsGroup')
            group_obj = NewsGroup()
            group_obj.set('Title', self.groups[groupIndex][0][2])       # use the first article's title as the group title
            group_obj.set('GroupIndex', groupIndex)
            group_obj.set('RankScore', rank_score)
            if group_img is not None:
                group_obj.set('ImageURL', group_img)
            group_obj.set('Articles', article_obj_ids)
            group_obj.save()

    def cal_group_rank_score(self, articles):
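        # rank = article-count score + media-diversity score + sentiment spread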
        article_num = len(articles)
        if article_num == 3:
            num_score = 2
        elif 4 <= article_num <= 8:
            num_score = 3
        else:
            num_score = 1

        senti_scores = []
        media_set = set()
        for article in articles:
            senti_scores.append(article[7])
            media_set.add(article[6])
        media_num = len(media_set)
        if media_num == 1:
            media_score = 0
        elif media_num == 2:
            media_score = 1
        elif media_num <= 4:
            media_score = 2
        else:
            media_score = 3

        senti_diff = int(np.max(senti_scores) - np.min(senti_scores))
        return num_score + media_score + senti_diff
Example #14
def readData(self):
    # open a connection to the database
    conn = MySQLConnector()
    conn.openConnector()
    # ++++++++++++++ query boiler state +++++++++++++++++++++++++++++++++
    # extract the column names to fetch from the SQL statement
    columnsPattern = re.compile("SELECT DISTINCT (.*?) FROM")
    columnsStr = re.findall(columnsPattern, self._QueryBoilerState)[0]
    columns = columnsStr.strip().split(',')
    # strip the two-character table alias prefix from each column name
    columns = [(c.strip())[2:] for c in columns]
    # remap the column names if a mapping is provided
    if len(self._BoilerColumnsMap()) != 0:
        columns = [self._BoilerColumnsMap()[c] for c in columns]
    # execute the boiler-state query on the cursor
    conn.cursor.execute(self._QueryBoilerState)
    tempBoilerState = np.array(conn.cursor.fetchall())
    # normalize the datetimes in this data set
    modifiedTime = [tools_DateTimeTrans(i) for i in tempBoilerState[:, 0]]
    BoilerData = pd.DataFrame(np.array(tempBoilerState[:, 1:], np.float32),
                              index=modifiedTime,
                              columns=columns[1:])
    # ++++++++++++++ query boiler state +++++++++++++++++++++++++++++++++
    # ++++++++++++++ query weather state ++++++++++++++++++++++++++++++++
    # extract the column names to fetch from the SQL statement
    columnsStr = re.findall(columnsPattern, self._QueryWeatherState)[0]
    columns = columnsStr.strip().split(',')
    columns = [c.strip()[2:] for c in columns]
    # remap the column names if a mapping is provided
    if len(self._WeatherColumnsMap()) != 0:
        columns = [self._WeatherColumnsMap()[c] for c in columns]
    # execute the weather-state query on the cursor
    conn.cursor.execute(self._QueryWeatherState)
    tempWeatherState = np.array(conn.cursor.fetchall())
    modifiedTime = [tools_DateTimeTrans(i) for i in tempWeatherState[:, 0]]
    WeatherData = pd.DataFrame(np.array(tempWeatherState[:, 1:],
                                        np.float32),
                               index=modifiedTime,
                               columns=columns[1:])
    # ++++++++++++++ query weather state ++++++++++++++++++++++++++++++++
    # ++++++++++++++ query indoor state +++++++++++++++++++++++++++++++++
    dev_ids = self._CollectInHomeDeviceID()
    dev_DataFrames = []
    for k, dev_id in enumerate(dev_ids):
        # build the SQL statement for each individual device
        individualDevSQL = self._QueryInHomeState % (dev_id)
        # extract the column names to fetch from the SQL statement
        columnsStr = re.findall(columnsPattern, individualDevSQL)[0]
        # remap the column names if a mapping is provided
        columns = columnsStr.strip().split(',')
        columns = [c.strip()[2:] for c in columns]
        if len(self._InHomeColumnsMap()) != 0:
            columns = [self._InHomeColumnsMap()[c] for c in columns]
        columns = [u'%d#传感器%s' % (k, c) for c in columns]
        # execute the indoor-state query on the cursor
        conn.cursor.execute(individualDevSQL)
        tempInHomeState = np.array(conn.cursor.fetchall())
        try:
            modifiedTime = [
                tools_DateTimeTrans(i) for i in tempInHomeState[:, 0]
            ]
            InHomeData = pd.DataFrame(np.array(tempInHomeState[:, 1:],
                                               np.float32),
                                      index=modifiedTime,
                                      columns=columns[1:])
            dev_DataFrames.append(InHomeData)
        except IndexError:
            print u'Indoor sensor %d# returned no data' % k
            continue
    # ++++++++++++++ query indoor state +++++++++++++++++++++++++++++++++
    # ++++++++++++++ query heat-exchange station state ++++++++++++++++++
    if self._QueryHRZState is not False:
        # extract the column names to fetch from the SQL statement
        columnsStr = re.findall(columnsPattern, self._QueryHRZState)[0]
        columns = columnsStr.strip().split(',')
        columns = [c.strip()[2:] for c in columns]
        # remap the column names if a mapping is provided
        if len(self._HRZColumnsMap()) != 0:
            columns = [self._HRZColumnsMap()[c] for c in columns]
        # execute the heat-exchange-station query on the cursor
        conn.cursor.execute(self._QueryHRZState)
        tempHRZState = np.array(conn.cursor.fetchall())
        modifiedTime = [tools_DateTimeTrans(i) for i in tempHRZState[:, 0]]
        HRZData = pd.DataFrame(np.array(tempHRZState[:, 1:], np.float32),
                               index=modifiedTime,
                               columns=columns[1:])
    else:
        HRZData = pd.DataFrame()
    totalData = self.concatData(BoilerData, WeatherData, dev_DataFrames,
                                HRZData)
    return totalData
Example #15
import argparse
from MySQLConnector import MySQLConnector

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-host', type=str, help='host ip/domain')
    parser.add_argument('-id', type=str, help='id')
    parser.add_argument('-db', type=str, help='Database name')
    parser.add_argument('-verbose',
                        action='store_true',
                        default=False,
                        help='verbose option')

    args = parser.parse_args()

    conn = MySQLConnector(args.host, args.id, args.db, verbose=args.verbose)

    while True:
        print(
            "===================================================================="
        )
        print(
            "q : query, qs : query with string, gs : get settings, p : print history, c : clear history"
        )
        print(
            "gh : get history, ghd : get history dict, ec : export csv, ep : export pickle, ri : remove invalid"
        )
        print("quit : quit")
        a = input("opcode > ")
        print(
            "--------------------------------------------------------------------"
        )
Example #16
def open_dbconnection(self):
    """ Create and return a MySQL connection object """
    conn = MySQLConnector()
    conn.open(self.mysql_connection['host'], self.mysql_connection['name'],
              self.mysql_connection['user'], self.mysql_connection['pass'])
    return conn
Example #17
def __init__(self, date):
    self.date = date
    self.articles = []
    self.connector = MySQLConnector()
    leancloud.init("U83hlMObhFRFRS4kX3lOxSlq-gzGzoHsz", "Jw2Y6KFFsjI5kEz1qYqQ62da")
    logging.basicConfig(level=logging.DEBUG)
Example #18
from MySQLConnector import MySQLConnector
if __name__=="__main__":
    print("Hello world!")
    print(MySQLConnector().connect())
Example #19
class Cluster():
    def __init__(self, date):
        self.date = date
        self.connector = MySQLConnector()

        self.logger = logging.getLogger('cluster')
        self.logger.setLevel(level=logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)  # log-level threshold for console output
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        self.logger.addHandler(ch)
        self.logger.info("Finished initialization.")

    def remove_punctuation(self, text):
        text = re.sub(r'[{}]+'.format(punctuation), '', text)
        return text.strip().lower()

    def load_articles(self):
        '''Load articles that have not yet been grouped from the database.'''
        cursor = self.connector.connect()
        sql = "SELECT articleIndex, title, text, mediaIndex FROM news.article WHERE downloadDate='%s'" % (
            self.date)
        # sql = '''SELECT articleIndex, title, text, mediaIndex  FROM news.article where downloadDate in ('20200728', '20200729', '20200731', '20200801')'''
        try:
            cursor.execute(sql)
            self.articles = cursor.fetchall()
            self.logger.info("Successfully loaded %d articles." %
                             len(self.articles))
        except:
            self.logger.info("Error: unable to fecth data")
        self.connector.disconnect()

    def remove_useless_articles(self):
        self.load_articles()
        useless_articles = []
        media_articles = {}
        for article in self.articles:
            media = article[3]
            if media not in media_articles:
                media_articles[media] = set()
            title = article[1]
            if title in media_articles[media]:
                useless_articles.append(article[0])  # deduplicate
                self.logger.info("Found repeated article %s" % (article[0]))
                continue
            else:
                media_articles[media].add(title)
            if detect(article[1]) != 'en':
                # print(article[1])
                useless_articles.append(article[0])
                self.logger.info("Found article %s not in English" %
                                 (article[0]))
            else:
                list_words = word_tokenize(self.remove_punctuation(article[2]))
                if len(list_words) < 30:
                    useless_articles.append(article[0])
                    self.logger.info("Found article %s too short" %
                                     (article[0]))

        # print(useless_articles)
        cursor = self.connector.connect()
        for articleID in useless_articles:
            sql = "DELETE FROM `news`.`article` WHERE(`articleIndex` = %s)" % (
                articleID)
            try:
                cursor.execute(sql)
                self.connector.db.commit()
            except:
                self.connector.db.rollback()
                self.logger.info("Unable to delete useless articles")
        self.connector.disconnect()
        self.logger.info("Successfully removed %d articles" %
                         len(useless_articles))

    def encode_text(self, encode_obj='title', vector='tf-idf'):
        '''
        Encode the text with one of several vectorizers.
        '''
        if vector == 'count':
            vectorizer = CountVectorizer(stop_words='english',
                                         lowercase=True,
                                         binary=True,
                                         tokenizer=myTokenizer)
        elif vector == 'tf-idf':
            vectorizer = TfidfVectorizer(stop_words='english',
                                         lowercase=True,
                                         tokenizer=myTokenizer)
        corpus = []
        if encode_obj == 'title':  # title
            idx = 1
        # elif encode_obj == 'summary':   # summary
        #     idx = 4
        elif encode_obj == 'text':  # full text
            idx = 2

        for article in self.articles:
            corpus.append(article[idx])

        X = vectorizer.fit_transform(corpus)
        #print(len(vectorizer.get_feature_names()))
        # print (vectorizer.get_feature_names())
        return X

    def cluster(self):
        X = self.encode_text(encode_obj='text', vector='tf-idf')
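        # DBSCAN with cosine distance over tf-idf vectors; label -1 marks unclustered noise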
        db_model = DBSCAN(eps=0.4, min_samples=3, metric='cosine').fit(X)
        joblib.dump(db_model, './models/model_%s.pkl' % self.date)
        self.group_list = db_model.labels_

        # print the clustering results
        self.logger.info("Cluster results: %d groups" %
                         (np.max(self.group_list) + 1))
        # for i in range(len(self.group_list)):
        #     if self.group_list[i] != -1:
        #         print(str(self.group_list[i])+'\t'+str(self.articles[i][3])+'\t'+self.articles[i][1])
        group_result = {}
        for i, label in enumerate(self.group_list):
            if label not in group_result:
                group_result[label] = []
            group_result[label].append(self.articles[i][1])
        for i in range(0, np.max(self.group_list) + 1):
            print(i)
            print(group_result[i])

    def upload_groups_to_DB(self):
        cursor = self.connector.connect()
        for i, label in enumerate(self.group_list):
            if label == -1:
                continue
            sql = "UPDATE news.article SET groupIndex='%s%03d' WHERE articleIndex = '%s'" % (
                self.date, label, self.articles[i][0])
            #print(sql)
            try:
                cursor.execute(sql)
                self.connector.db.commit()
            except:
                # roll back on error
                self.logger.info("Unable to update group index!")
                self.connector.db.rollback()
        self.connector.disconnect()
        self.logger.info("Successfully update group indices.")