Exemplo n.º 1
0
    def get_summary_corpus(self, start_time, end_time):
        """
        Load the abstracts flagged as lined within a publish-time window.

        Joins ``news_abstract_result`` with ``base_data`` and keeps rows with
        ``language_type = 1`` and ``is_lined = 1``.

        :param start_time: inclusive lower bound for ``bd.publishtime``;
            interpolated into the SQL text as a quoted literal
        :param end_time: exclusive upper bound for ``bd.publishtime``;
            interpolated into the SQL text as a quoted literal
        :return: list with one ClusterMessageObj per fetched row, built from
            the row id, the UTF-8 encoded abstract, the publish time formatted
            as ``%Y-%m-%d %H:%M:%S``, an empty string as the fourth
            constructor argument, and the UTF-8 encoded site name
        """
        query_template = """
            SELECT bd.id, bd.publishtime, bd.site_name, nar.abstract  
            FROM news_abstract_result nar, base_data bd 
            WHERE nar.bd_id = bd.id 
            AND bd.publishtime >= '%s' AND bd.publishtime < '%s' 
            AND nar.language_type = 1 
            AND nar.is_lined = 1"""
        rows = self.fetch_all(query_template % (start_time, end_time))

        # Arguments are evaluated left to right, i.e. in the same order the
        # fields are read from each row: id, abstract, publishtime, site_name.
        return [
            ClusterMessageObj(
                row['id'],
                row['abstract'].encode('utf-8'),
                row['publishtime'].strftime(
                    "%Y-%m-%d %H:%M:%S").decode('utf-8'),
                '',
                row['site_name'].encode('utf-8'))
            for row in rows
        ]
Exemplo n.º 2
0
    def parse_corpus_records(records, language_type=None):
        """
        Convert fetched rows into ClusterMessageObj instances.

        :param records: iterable of rows supporting lookup by the keys
            ``id``, ``title``, ``content``, ``publishtime`` and ``site_name``
        :param language_type: when not None, a row is kept only if
            ``BaseDataView.is_valid_language(language_type, content)`` is
            truthy for its UTF-8 encoded content; when None every row is kept
        :return: list of ClusterMessageObj in the order of ``records``
        """
        messages = []
        for record in records:
            message_id = record['id']
            encoded_title = record['title'].encode('utf-8')
            encoded_content = record['content'].encode('utf-8')
            published_at = record['publishtime'].strftime(
                "%Y-%m-%d %H:%M:%S").decode('utf-8')
            encoded_site = record['site_name'].encode('utf-8')

            message = ClusterMessageObj(message_id, encoded_title,
                                        published_at, encoded_content,
                                        encoded_site)

            # Language filter: drop the row only when a language was
            # requested and the content does not pass the check.
            if (language_type is not None
                    and not BaseDataView.is_valid_language(language_type,
                                                           encoded_content)):
                continue
            messages.append(message)
        return messages
Exemplo n.º 3
0
    def parse_corpus_records(records):
        """
        Convert fetched rows into ClusterMessageObj instances.

        :param records: iterable of rows supporting lookup by the keys
            ``id``, ``title``, ``content``, ``publishtime`` and ``site_name``
        :return: list with one ClusterMessageObj per row, in input order
        """

        def to_message(record):
            # Fields are read in the order id, title, content, publishtime,
            # site_name; text fields are encoded to UTF-8 and the publish
            # time is rendered as "%Y-%m-%d %H:%M:%S".
            message_id = record['id']
            encoded_title = record['title'].encode('utf-8')
            encoded_content = record['content'].encode('utf-8')
            published_at = record['publishtime'].strftime(
                "%Y-%m-%d %H:%M:%S").decode('utf-8')
            encoded_site = record['site_name'].encode('utf-8')
            return ClusterMessageObj(message_id, encoded_title, published_at,
                                     encoded_content, encoded_site)

        return [to_message(record) for record in records]
Exemplo n.º 4
0
def query_string(querystring):
    """Fetch all documents matching ``querystring`` from the SaaS platform.

    A first request (``display=id/1/1``) reads the ``total`` field of the JSON
    response; the documents are then fetched in pages of 100 with
    ``display=id&title&pubtime/<offset>/<count>`` and every entry of each
    page's ``doc`` list is turned into a ClusterMessageObj.

    :param querystring: query segment appended verbatim to the platform URI
        (it is not URL-encoded here)
    :return: list of ClusterMessageObj built from the ``id``, ``title`` and
        ``pubtime`` fields of every returned document
    """
    # NOTE(review): the requests below have no timeout and the HTTP status is
    # not checked; presumably acceptable on the internal network - confirm.
    messages = []

    # Request URI prefix shared by the count request and the page requests.
    prefix_req_uri = "http://saas1:5000/enterprise_saas_platform/saas_platform/" + querystring + "/"
    suffix_uri_total = "display=id/1/1"
    start_time = datetime.now()
    # The format string needs one placeholder per value; with a single
    # placeholder and two values the %-formatting raised TypeError before
    # any request was made.
    logger.info('starting query_string, {prefix_req_uri: %s, start_time: %s}',
                prefix_req_uri, start_time.strftime('%Y-%m-%d %H:%M:%S'))

    # Ask for the total number of matching documents.
    total_resp = requests.get(prefix_req_uri + suffix_uri_total)
    total = json.loads(total_resp.text)["total"]

    # Number of documents requested per page.
    page_size = 100

    # Walk the result set page by page and collect the documents.
    display_uri_data = "display=id&title&pubtime/"
    for offset in range(0, total, page_size):
        data_uri = (prefix_req_uri + display_uri_data + str(offset) + "/" +
                    str(page_size))
        data_resp = requests.get(data_uri)

        for doc in json.loads(data_resp.text)["doc"]:
            messages.append(
                ClusterMessageObj(messageId=doc["id"],
                                  messageTitle=doc["title"],
                                  messagePublishtime=doc["pubtime"]))

    # total_seconds() covers the whole elapsed time; the ``seconds``
    # attribute alone discards any whole days.
    elapsed_seconds = int((datetime.now() - start_time).total_seconds())
    logger.info(
        'end query_string: {prefix_req_uri: %s, total: %d, lost_seconds: %ds}',
        prefix_req_uri, total, elapsed_seconds)
    return messages
Exemplo n.º 5
0
def dic_clusterobj(dic):
    """Build a ClusterMessageObj whose attributes are the entries of ``dic``.

    The object is created with no constructor arguments and its ``__dict__``
    is then rebound to ``dic`` itself (not a copy), so the returned object
    and the dictionary share the same storage.

    :param dic: mapping of attribute names to values
    :return: the populated ClusterMessageObj
    """
    cluster_message = ClusterMessageObj()
    setattr(cluster_message, '__dict__', dic)
    return cluster_message
Exemplo n.º 6
0
def _encode_utf8_or_keep(value, field_name):
    """Return ``value`` encoded as UTF-8, or ``value`` unchanged on failure."""
    try:
        return value.encode('utf-8')
    except Exception as e:
        # Deliberate best effort: a value that cannot be encoded (for
        # example None) is logged at debug level and passed through as is.
        logger.debug("Error: %s.encode('utf-8'), {exception: %s}" %
                     (field_name, e))
        return value


def get_involved_china_corpus(start_time, end_time, language_type, group_id):
    """
    Fetch the clustering corpus of rows with ``involved_china = 1``.

    :param start_time: inclusive lower bound for ``publishtime``
    :param end_time: exclusive upper bound for ``publishtime``
    :param language_type: value compared against ``language_type``
    :param group_id: text placed after ``group_id IN``; presumably already
        shaped like a SQL value list such as ``(1, 2)`` - verify with callers
    :return: list of ClusterMessageObj, one per fetched row; if the query or
        the row conversion fails with an ``Exception`` the error is logged
        and the rows collected so far (possibly none) are returned
    """
    data = []
    start_timestamp = time.time()

    # NOTE(review): the statement is assembled by string interpolation, so
    # every argument must come from a trusted source. Switching to
    # cursor.execute(sql, params) needs the exact shape of group_id, which is
    # not visible here.
    # NOTE(review): an earlier comment says only the first paragraph of the
    # content is used; nothing here truncates it, so presumably
    # base_data_view does - confirm.
    sql = """
    SELECT id, title, content, publishtime, site_name 
    FROM base_data_view 
    WHERE publishtime >= '%s' AND publishtime < '%s' 
    AND language_type = %s 
    AND group_id IN %s 
    AND involved_china = 1 
    group by titlehash """ % (start_time, end_time, language_type, group_id)

    logger.debug("starting get_involved_china_corpus, {sql: %s}." % sql)

    # Open the database connection. Intranet host: 10.30.248.210, public
    # host: 47.93.162.134.
    # NOTE(review): credentials are hard-coded in source; they should be
    # moved to configuration or the environment.
    db = MySQLdb.Connection(host='10.30.248.210',
                            user='******',
                            passwd='Wi$eWeb123',
                            db='wjbdb',
                            charset='utf8',
                            port=5720)

    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql)

            # Row layout follows the SELECT list:
            # id, title, content, publishtime, site_name.
            for row in cursor:
                title = _encode_utf8_or_keep(row[1], 'title')
                content = _encode_utf8_or_keep(row[2], 'content')
                publish_time = row[3].strftime(
                    "%Y-%m-%d %H:%M:%S").decode('utf-8')
                site_name = _encode_utf8_or_keep(row[4], 'site_name')

                data.append(
                    ClusterMessageObj(row[0], title, publish_time, content,
                                      site_name))

            logger.debug(
                "ending get_involved_china_corpus, {data length: %s, cost_times: %ds}"
                % (len(data), time.time() - start_timestamp))
        except Exception as e:
            # Same best-effort contract as before, but keep the traceback.
            logger.exception(
                "Error: get_involved_china_corpus, {exception: %s}" % e)
        finally:
            cursor.close()
    finally:
        # Runs even if creating or closing the cursor fails, so the
        # connection is never leaked.
        db.close()

    # Returning here rather than inside ``finally`` lets KeyboardInterrupt
    # and SystemExit propagate instead of being silently discarded.
    return data