def __init__(self, sub_group ='M03A'):
        """Prepare paths, table names and URL patterns for one freeway ETC data group.

        Parameters
        ----------
        sub_group : str
            ETC data group code (e.g. 'M03A'); selects the remote TDCS
            directory and the local warehouse/table names.
        """
        import Lily.ctao.hostmetadata as chmd
        import Lily.ctao.database as cdb
        self.sub_group      = sub_group
        self.hostmetadata   = chmd.hostmetadata()
        self.database       = cdb.database(self.hostmetadata.database)

        self.sub_warehouse  = '{0}/crawler_ETC_{1}'.format(self.hostmetadata.warehouse, self.sub_group)
        # NOTE(review): 'clawler' looks like a typo for 'crawler', but the name
        # is kept because existing workbooks may already use it.
        self.excel_filename    = '{0}/data_clawler_ETC_{1}_list.xlsx'.format(self.hostmetadata.warehouse, self.sub_group)
        self.sqlite_tablename  = 'data_crawler_ETC_{0}_list'.format(self.sub_group)
        self.sqlite_tablepull  = 'data_crawler_ETC_{0}_pull'.format(self.sub_group)

        # Create the sub-warehouse directory if missing. makedirs also creates
        # missing parent directories (os.mkdir would raise on them) and
        # exist_ok avoids the check-then-create race.
        os.makedirs(self.sub_warehouse, exist_ok=True)

        # Date regular expression YYYYMMDD (years 1000-2999, months 01-12, days 01-31).
        date_YYYYMMDD_pattern = r'''([12]\d{3}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01]))'''

        self.url = 'http://tisvcloud.freeway.gov.tw/history/TDCS/{0}/'.format(self.sub_group)
        # Archive name patterns, e.g. M03A_20200101.tar.gz, as they appear in
        # the remote directory listing (href="...") and on local disk.
        self.cloud_archive_pattern = r'href="({0}_{1}\.tar\.gz)"'.format(self.sub_group, date_YYYYMMDD_pattern)
        self.local_archive_pattern = r'({0}_{1}\.tar\.gz)'.format(self.sub_group, date_YYYYMMDD_pattern)
        self.check_archive_list()
Пример #2
0
def cwb_melt2():
    """Explode the ';'-separated Stations field of each sensible-earthquake row
    into one record per station and store the result as a new table.
    """
    db = cdb.database('data_crawler.sqlite')
    quakes = db.to_dataframe('data_rdset_pylily_cwb_sensible_earthquake')

    records = []
    for idx, _ in quakes.iterrows():
        for entry in quakes.at[idx, 'Stations'].split(';'):
            # Skip empty fragments and the "area maximum intensity" header text.
            if entry == '' or u'''地區最大震度''' in entry:
                continue
            # Station entries are "<name>\u3000...\u3000<intensity>".
            parts = entry.split('\u3000')
            records.append([
                quakes.at[idx, 'id'],
                quakes.at[idx, 'time'],
                float(quakes.at[idx, 'px'][4:-2]),
                float(quakes.at[idx, 'py'][4:-2]),
                float(quakes.at[idx, 'depth'][:-3]),
                float(quakes.at[idx, 'ML']),
                quakes.at[idx, 'Location'],
                ''.join(parts[:-1]),
                float(parts[-1]),
            ])

    melted = pandas.DataFrame(
        records, columns=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'])
    melted.to_sql(
        'data_rdset_pylily_cwb_sensible_earthquake_LocalSeismicIntensity',
        db.connect,
        if_exists='replace',
        index=False)
    return
def check_docx(docx_file_name):
    """Extract every table from a .docx file into the project database and an
    Excel workbook alongside the document.

    Each table becomes one sqlite table and one worksheet, both named
    '<docx name>_NNN' (zero-padded, starting at 002 to match the numbering
    this tool has historically produced).

    Parameters
    ----------
    docx_file_name : str
        Path to the Word document to scan.

    Returns
    -------
    list of str
        The generated table/sheet names, in document order.
    """
    from Lily.ctao.database     import database
    from Lily.ctao.hostmetadata import hostmetadata
    from Lily.blacksmith.file_feature import get_feature

    host    = hostmetadata()
    db      = database(host.database)
    doc     = Document(docx_file_name)
    feature = get_feature(docx_file_name)

    excelfile = feature['path'] + '/' + feature['name'] + '.xlsx'
    tablename = (feature['name'] + '_{0}')
    writer    = pandas.ExcelWriter(excelfile, engine='xlsxwriter')

    sheetlist = []
    # enumerate replaces the original hand-rolled counter; start=2 keeps the
    # historical numbering (the first table gets suffix '002').
    for counter, tab in enumerate(doc.tables, start=2):
        rows = [[cell.text for cell in row.cells] for row in tab.rows]
        df = pandas.DataFrame(rows)
        table_name = tablename.format(str(counter).zfill(3))
        sheetlist.append(table_name)
        df.to_sql(table_name, db.connect, if_exists='replace')
        df.to_excel(writer, sheet_name=table_name)

    writer.save()
    writer.close()
    return sheetlist
Пример #4
0
    def __init__(self):
        """Set up the monthly vehicle-detector crawler: a per-month sqlite
        database plus a catalog of open-data sources to poll.
        """
        self.ctaohost = hmd.hostmetadata()

        # One database file per calendar month.
        stamp = datetime.datetime.today().strftime('%Y%m')
        self.database_filename = '{0}/ctao_data_crawler_vehicledetect_{1}.sqlite'.format(
            self.ctaohost.warehouse, stamp)
        self.database = cdb.database(self.database_filename)
        self.sub_group = 'data_crawler_vd'

        # source name -> [url, regex capturing the exchange time, its strptime format]
        self.dict_data = {
            'tpec_vddata': [
                'https://tcgbusfs.blob.core.windows.net/blobtisv/GetVDDATA.xml.gz',
                '<ExchangeTime>(.*)</ExchangeTime>', '%Y/%m/%dT%H:%M:%S'
            ],
            'tpec_vd': [
                'https://tcgbusfs.blob.core.windows.net/blobtisv/GetVD.xml.gz',
                '<vd:ExchangeTime>(.*)</vd:ExchangeTime>', '%Y/%m/%dT%H:%M:%S'
            ],
            'nfbx_1968': [
                'http://tisvcloud.freeway.gov.tw/xml/1min_incident_data_1968.xml',
                'time="([^"]*)"', '%Y-%m-%d %H:%M:%S'
            ],
            'nfbx_rlx1': [
                'http://tisvcloud.freeway.gov.tw/roadlevel_value.xml.gz',
                'updatetime="([^"]*)"', '%Y/%m/%d %H:%M:%S'
            ],
            'nfbx_rlx5': [
                'http://tisvcloud.freeway.gov.tw/roadlevel_value5.xml.gz',
                'updatetime="([^"]*)"', '%Y/%m/%d %H:%M:%S'
            ],
            'nfbx_vdx1': [
                'http://tisvcloud.freeway.gov.tw/vd_value.xml.gz',
                'updatetime="([^"]*)"', '%Y/%m/%d %H:%M:%S'
            ],
            'nfbx_vdx5': [
                'http://tisvcloud.freeway.gov.tw/vd_value5.xml.gz',
                'updatetime="([^"]*)"', '%Y/%m/%d %H:%M:%S'
            ]
        }

        # Catalog of all open-data sources as a DataFrame indexed by source name.
        self.list_df = pandas.DataFrame.from_dict(
            self.dict_data,
            orient='index',
            columns=[
                'url', 'exchange_time_repattern',
                'exchange_time_datetimepattern'
            ])

        # Placeholder columns, overwritten each time a source is downloaded.
        self.list_df['gzip_context'] = numpy.random.bytes(1)
        self.list_df['download_datetime'] = numpy.datetime64(
            datetime.datetime.now())
        self.list_df['exchange_datetime'] = numpy.datetime64(
            datetime.datetime.now())
Пример #5
0
def cwb_melt1():
    """Flatten downloaded CWB sensible-earthquake HTML pages into one row per event.

    Reads the newest lzma-compressed HTML per event id from
    data_crawler_cwb_sensible_earthquake_download, scrapes the summary table
    (time / coordinates / depth / magnitude / location) and the per-station
    intensity table, appends the result to
    data_rdset_pylily_cwb_sensible_earthquake, then deletes superseded rows so
    only the latest row per id remains.
    """
    import lzma
    db = cdb.database('data_crawler_cwb_earthquake_list.sqlite')

    # Newest download per event id.
    sql = '''
        select max(rowid) rowid, id,  lzma_html from data_crawler_cwb_sensible_earthquake_download group by id
    '''
    df = pandas.read_sql(sql, db.connect, index_col=['rowid'])

    df = df.reindex(columns=[
        'id', 'lzma_html', 'time', 'py', 'px', 'depth', 'ML', 'Location',
        'Stations'
    ],
                    fill_value='')

    for ind, row in df.iterrows():
        html_tables = pandas.read_html(lzma.decompress(
            sqlite3.Binary(row['lzma_html'])),
                                       encoding='utf-8')
        # Table 2 holds the event summary, one (label, value) pair per row.
        arg2 = html_tables[2]
        df.at[ind, 'time'] = arg2.iat[0, 1]
        df.at[ind, 'py'] = arg2.iat[1, 1]
        df.at[ind, 'px'] = arg2.iat[2, 1]
        df.at[ind, 'depth'] = arg2.iat[3, 1]
        df.at[ind, 'ML'] = arg2.iat[4, 1]
        df.at[ind, 'Location'] = arg2.iat[5, 1]

        # Table 3 lists station intensities; collect every string cell into a
        # single ';'-separated Stations field.
        for ind2, row2 in html_tables[3].iterrows():
            if isinstance(row2, pandas.core.series.Series):
                for elem in row2:
                    if isinstance(elem, str):
                        df.at[ind,
                              'Stations'] = df.at[ind, 'Stations'] + ';' + elem
            else:
                # BUG FIX: this branch previously tested `elem`, which is only
                # bound by the Series branch above (stale value or NameError);
                # row2 itself is the scalar being appended here.
                if isinstance(row2, str):
                    df.at[ind,
                          'Stations'] = df.at[ind, 'Stations'] + ';' + row2

    df = df.drop(columns=['lzma_html'])
    df.to_sql('data_rdset_pylily_cwb_sensible_earthquake',
              db.connect,
              if_exists='append',
              index=False)

    # Deduplicate: keep only the newest row per event id.
    db.connect.execute('''delete from {0} where rowid not in 
                         (select max (rowid) from {0} group by id)'''.format(
        'data_rdset_pylily_cwb_sensible_earthquake'))

    db.connect.commit()
    return
Пример #6
0
def cwb_crawler():
    """Download CWB sensible-earthquake detail pages (last ~3 months through
    next month) and archive them, lzma-compressed, into sqlite.
    """
    import Lily.crawler.url_string as curlstr
    db = cdb.database('data_crawler_cwb_earthquake_list.sqlite')

    db_target_tab = 'data_crawler_cwb_sensible_earthquake_download'

    eq_dir_pa = '''https://scweb.cwb.gov.tw/earthquake/Page.aspx?ItemId=20&Date={0}'''
    sub_url = '''https://scweb.cwb.gov.tw/earthquake/Page.aspx{0}'''

    # Month-end stamps covering roughly the previous 90 days through next month.
    months = pandas.date_range(
        datetime.datetime.today() - datetime.timedelta(days=90),
        datetime.datetime.today() + datetime.timedelta(days=31),
        freq='M')

    for month in months:
        batch = pandas.DataFrame(
            columns=['id', 'lzma_html', 'download_time'])
        time.sleep(90 / 1000.0)  # throttle between monthly listing requests

        listing = curlstr.url_string(eq_dir_pa.format(month.strftime('%Y%m')))
        dom = LH.fromstring(listing.to_str())

        seen = set()
        for href in dom.xpath('//tr/td/a/@href'):
            # Follow each distinct "?ItemId=..." detail link exactly once.
            if href in seen or href[1:7] != 'ItemId':
                continue
            print('download html', href)
            page = curlstr.url_string(sub_url.format(href))
            seen.add(href)
            batch.loc[len(batch)] = [
                href[22:],
                page.to_lzma_xz(),
                datetime.datetime.now()
            ]

        batch.to_sql(db_target_tab,
                     db.connect,
                     if_exists='append',
                     index=False)

    # Drop duplicate downloads, keeping the newest per (id, lzma_html).
    db.connect.execute('''delete from {0} where rowid not in 
                         (select max (rowid) from {0} group by id, lzma_html)'''
                       .format(db_target_tab))

    db.connect.commit()
    return
Пример #7
0
def to_database( target_dir ):
    """Scan target_dir, md5-hash every file, and replace the per-host md5
    snapshot table in the project database with the result.
    """
    import Lily.ctao.database as cdb
    import Lily.ctao.nsgstring as nstr
    import Lily.ctao.hostmetadata as chmd

    host = chmd.hostmetadata()
    db = cdb.database(host.database)

    # Sanitize platform / hostname / directory into identifier-safe fragments
    # so they can be embedded in the table name.
    name_parts = (nstr.alnum(host.platform),
                  nstr.alnum(host.hostname),
                  nstr.alnum(target_dir))
    table_name = '''data_rdset_filemd5_{0}_{1}_hhot_{2}'''.format(*name_parts)

    features = get_all_filefeature_with_md5sum(target_dir)
    features.to_sql(table_name, db.connect, if_exists='replace', index=False)
Пример #8
0
def cwb_melt1():
    """Melt crawled route/stop blobs into flat columns (appears unfinished).

    NOTE(review): this block looks like an abandoned draft — see the inline
    notes below. A later definition of ``cwb_melt1`` in this file shadows
    this one at import time, so this version is never the one called.
    """
    import lzma
    db = cdb.database('data_crawler.sqlite')

    # NOTE(review): the SQL is missing its table name ("from  group by id")
    # and selects no rowid column, yet read_sql asks for index_col=['rowid'] —
    # this query cannot run as written.
    sql = '''
        select Id, routeId,  nameZh, nameEn, seqNo, pgp, longitude, showLon, showLat, vector from  group by id
    '''
    df = pandas.read_sql(sql, db.connect, index_col=['rowid'])

    # NOTE(review): reindexing to 'a'..'i' discards the named columns the loop
    # below writes ('routeId', 'nameZh', ...) and drops the 'lzma_html' column
    # it reads — the loop body is inconsistent with this schema.
    df = df.reindex(columns=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'],
                    fill_value='')

    for ind, row in df.iterrows():
        #        print ('melt', row[0])
        json_tables = pandas.read_html(lzma.decompress(
            sqlite3.Binary(row['lzma_html'])),
                                       encoding='utf-8')
        arg2 = json_tables[2]
        df.at[ind, 'routeId'] = arg2.iat[0, 1]
        df.at[ind, 'nameZh'] = arg2.iat[1, 1]
        df.at[ind, 'nameEn'] = arg2.iat[2, 1]
        df.at[ind, 'seqNo'] = arg2.iat[3, 1]
        df.at[ind, 'pgp'] = arg2.iat[4, 1]
        df.at[ind, 'longitude'] = arg2.iat[5, 1]

        for ind2, row2 in json_tables[3].iterrows():
            if isinstance(row2, pandas.core.series.Series):
                for elem in row2:
                    if isinstance(elem, str):
                        df.at[ind, 'i'] = df.at[ind, 'i'] + ',' + elem
            else:
                # NOTE(review): `elem` here is stale from the inner loop of the
                # previous branch (or unbound) — this likely meant `row2`.
                if isinstance(elem, str):
                    df.at[ind, 'i'] = df.at[ind, 'i'] + ',' + row2

    #df = df.drop(columns=['lzma_html'])
    df.to_sql('data_rdset_pylily', db.connect, if_exists='append', index=False)

    # Deduplicate: keep only the newest row per id.
    db.connect.execute('''delete from {0} where rowid not in 
                         (select max (rowid) from {0} group by id)'''.format(
        'data_rdset_pylily'))

    db.connect.commit()
    return
Пример #9
0
def cwb_melt1():
    """Melt crawled Taipei bus-stop data into flat columns (appears unfinished).

    NOTE(review): this block looks like an abandoned draft and cannot run as
    written — see the inline notes below.
    """
    import lzma
    db = cdb.database('data_crawler.sqlite')

    # NOTE(review): the SQL selects from a filesystem path
    # ('d:/0702/臺北市站牌.gz'), which is not a valid sqlite table reference,
    # and no 'rowid' column is selected for the index_col below.
    sql = '''
        select Id, routeId,  nameZh, nameEn, seqNo, pgp, longitude, showLon, showLat, vector from  'd:/0702/臺北市站牌.gz' group by id
    '''
    df = pandas.read_sql(sql, db.connect , index_col=['routeId'])

    df = df.reindex ( columns=[ 'Id','routeId','nameZh', 'nameEn', 'seqNo', 'pgp', 'longitude', 'showLon','showLat','vector' ], fill_value=''  )

    for ind, row in df.iterrows():
#        print ('melt', row[0])
        # NOTE(review): `json_tables` is never defined in this function —
        # the first iteration would raise NameError.
        arg2 = json_tables[0]
        df.at[ind,'routeId'] = arg2.iat[0,1] 
        df.at[ind,'nameZh'] = arg2.iat[1,1]
        df.at[ind,'nameEn'] = arg2.iat[2,1]
        df.at[ind,'seqNo'] = arg2.iat[3,1]
        df.at[ind,'pgp'] = arg2.iat[4,1]
        df.at[ind,'longitude'] = arg2.iat[5,1]
        df.at[ind,'showLon'] = arg2.iat[6,1]
        df.at[ind,'showLat'] = arg2.iat[7,1]
        df.at[ind,'vector'] = arg2.iat[8,1]

        for ind2, row2 in json_tables[3].iterrows():
            if isinstance(row2, pandas.core.series.Series):
                for elem in row2:
                    if isinstance(elem,str):
                        # NOTE(review): column 'i' is not in the reindexed
                        # frame above — reading df.at[ind, 'i'] would fail.
                        df.at[ind, 'i'] = df.at[ind, 'i'] + ',' + elem 
            else:
                # NOTE(review): `elem` here is stale from the inner loop of the
                # previous branch (or unbound) — this likely meant `row2`.
                if isinstance(elem,str):
                    df.at[ind, 'i'] = df.at[ind, 'i'] + ',' + row2

    df.to_sql('data_rdset_pylily', db.connect, if_exists='append', index=False)

    # Deduplicate: keep only the newest row per id.
    db.connect.execute('''delete from {0} where rowid not in 
                         (select max (rowid) from {0} group by id)'''.format('data_rdset_pylily') )
    
    db.connect.commit()
    return
Пример #10
0
def check_time():
    """Parse free-text date/time ranges from table hln_0206_3 into concrete
    begin/end datetimes (year fixed to 2018).

    Column 1 of each row may contain zero, one, or two MM/DD dates; column 2
    may contain zero, one, or two HH:MM times. Missing values default to
    01/01 and 00:00; a single value is used for both begin and end.
    """
    import Lily.ctao.database as cdb
    import Lily.ctao.nsgstring as nstr
    import Lily.ctao.hostmetadata as chmd
    import re
    host = chmd.hostmetadata()
    db = cdb.database(host.database)

    # MM/DD (month 1-12, day 01-31) and HH:MM capture patterns.
    day_pattern = r'''(0?[1-9]|1[0-2])/(0[1-9]|[12][0-9]|3[01])'''
    time_pattern = r'''([0-2][0-9]):([0-5][0-9])'''

    df = db.to_dataframe('hln_0206_3')
    df = df.iloc[1:]  # drop the header row

    for ind, row in df.iterrows():
        twoday = re.findall(day_pattern, row[1])
        twotim = re.findall(time_pattern, row[2])

        # Normalize to exactly two (begin, end) pairs.
        if len(twoday) == 0:
            twoday = [('01', '01'), ('01', '01')]
        if len(twoday) == 1:
            twoday = [twoday[0], twoday[0]]
        if len(twotim) == 0:
            twotim = [('00', '00'), ('00', '00')]
        if len(twotim) == 1:
            twotim = [twotim[0], twotim[0]]

        date1 = '2018-{0}-{1} {2}:{3}'.format(twoday[0][0], twoday[0][1],
                                              twotim[0][0], twotim[0][1])
        date2 = '2018-{0}-{1} {2}:{3}'.format(twoday[1][0], twoday[1][1],
                                              twotim[1][0], twotim[1][1])

        # BUG FIX: the strptime format now matches the assembled string
        # ('%Y%m%d %H%M' could never parse '2018-MM-DD HH:MM'), and .loc
        # writes back into df (df.iloc[ind][...] = ... assigned to a copy,
        # silently discarding the result).
        df.loc[ind, 'beg'] = datetime.datetime.strptime(date1, '%Y-%m-%d %H:%M')
        df.loc[ind, 'end'] = datetime.datetime.strptime(date2, '%Y-%m-%d %H:%M')
Пример #11
0
    def __init__(self):
        """Bind this object to the current host's metadata and open its
        logging database.
        """
        import Lily.ctao.hostmetadata as chmd
        import Lily.ctao.database as cdb

        host = chmd.hostmetadata()
        self.this_host = host
        self.log_database = cdb.database(host.database)