Example #1
    def load_csv(self, csvFile, tableName=None, delimiter=','):
        '''
        Load a CSV file into an SQLite table, replacing any existing
        table of the same name.
        '''
        if not tableName:
            tableName = toolkit_file.get_basename(csvFile)

        with open(csvFile, 'r') as f:
            reader = csv.reader(f, delimiter=delimiter)
            header = next(reader)
        # Sanitize column names: strip whitespace and replace spaces with underscores
        header = list(map(lambda x: x.strip().replace(' ', '_'), header))
        # print(header)
        drop_SQL = '''DROP TABLE IF EXISTS {}'''.format(tableName)
        self.execute(drop_SQL)
        # Stream the file in chunks so large CSVs do not exhaust memory;
        # header=0 skips the original header row in favour of the sanitized names
        chunks = pd.read_csv(csvFile,
                             chunksize=100000,
                             sep=delimiter,
                             dtype=str,
                             names=header,
                             header=0)
        for chunk in chunks:
            chunk.to_sql(name=tableName,
                         if_exists='append',
                         con=self.conn,
                         index=False)
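For reference, a minimal usage sketch; the SqliteDB class name, database path, and file names are hypothetical stand-ins for the wrapper class this method belongs to:

db = SqliteDB('example.db')  # hypothetical wrapper exposing conn and execute()
db.load_csv('cities.csv')    # table name defaults to the CSV basename
db.load_csv('cities.tsv', tableName='cities_tsv', delimiter='\t')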
Example #2
def extract_info(html):
    '''Extract information from an HTML page.'''
    infoDict = {
        'phone_number_prefix': int(toolkit_file.get_basename(html)),
        'province': None,
        'city': None,
        'card_type': None,
        'area_code': None,
        'postal_code': None,
    }

    # phone_number_prefix is not registered
    with open(html, encoding='utf-8') as f:
        if '验证手机号有误' in f.read():
            return infoDict

    # The first table on the page is a two-column key/value listing
    df = pd.read_html(html)[0]
    tmp_dict = {row[0]: row[1] for _, row in df.iterrows()}

    # print(tmp_dict)
    try:
        region = tmp_dict['卡号归属地']
        # Default to None so an unrecognized format cannot leave
        # province/city unbound below
        province, city = None, None
        if isinstance(region, float):
            # 卡号归属地 (card region) is NaN, i.e. the cell is empty
            pass
        elif isinstance(region, str) and len(region.split(' ')) == 2:
            # 卡号归属地 holds both parts, e.g. '江苏 南京' (province, city)
            province, city = region.split(' ')
        elif isinstance(region, str) and len(region.split(' ')) == 1:
            # 卡号归属地 only contains the province
            province = region
        infoDict['phone_number_prefix'] = int(tmp_dict['手机号码段'].replace(
            '*', ''))
        infoDict['province'] = province
        infoDict['city'] = city
        infoDict['card_type'] = tmp_dict['卡 类 型']
        infoDict['area_code'] = tmp_dict['区 号']
        infoDict['postal_code'] = tmp_dict['邮 编']
    except Exception as e:
        logging.error(type(tmp_dict['卡号归属地']))
        logging.error('Error when loading {}'.format(html))
        logging.error('Data {}'.format(tmp_dict))
        raise
    return infoDict
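A short, hedged usage example; the file path is hypothetical and assumes each saved page is named after its phone-number prefix, matching the int(...get_basename(html)) call above:

info = extract_info('pages/1351234.html')  # hypothetical path
print(info['province'], info['city'], info['card_type'])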
Example #3
def generate_file_list(dataset_dir):
    imgFileList = [
        x for x in toolkit_file.get_file_list(dataset_dir)
        if x.endswith('.jpg')
    ]
    # print(imgFileList)

    dataset_dict_list = []

    for file in imgFileList:
        pic_id = int(
            toolkit_file.get_basename(file, withExtension=False).replace(
                'image_', ''))
        # 80 images per class: map the 1-based pic_id to a zero-based group label
        group_id = (pic_id - 1) // 80
        dataset_dict_list.append({
            'pic_id': pic_id,
            'group_id': group_id,
            'image_path': file
        })
    return dataset_dict_list
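A quick usage sketch; the dataset/ directory is hypothetical and assumes files named image_0001.jpg, image_0002.jpg, and so on:

dataset = generate_file_list('dataset')  # hypothetical directory
print(dataset[0])
# e.g. {'pic_id': 1, 'group_id': 0, 'image_path': 'dataset/image_0001.jpg'}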
Example #4
    def load_json(self, JSON_FILE, tableName=None, full_refresh=True):
        '''
        Load a JSON file (with a top-level "rows" array) into SQLite.
        '''
        if not tableName:
            tableName = toolkit_file.get_basename(JSON_FILE)

        with open(JSON_FILE, 'r', errors='ignore') as f:
            dicSet = json.load(f)["rows"]

        if full_refresh:
            drop_SQL = '''DROP TABLE IF EXISTS {}'''.format(tableName)
            self.execute(drop_SQL)

        df = pd.DataFrame(dicSet)
        df.to_sql(name=tableName,
                  if_exists='append',
                  con=self.conn,
                  index=False,
                  chunksize=20000)
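This variant expects the JSON document to wrap its records in a top-level "rows" array. A hedged usage sketch, reusing the hypothetical SqliteDB wrapper from the Example #1 sketch and a hypothetical file:

# users.json: {"rows": [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]}
db.load_json('users.json')                      # drops and recreates table 'users'
db.load_json('users.json', full_refresh=False)  # appends to the existing table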
Example #5
    def load_json(self, JSON_FILE, tableName=None):
        '''
        Load a JSON array of objects into an existing SQLite table.
        '''
        if not tableName:
            tableName = toolkit_file.get_basename(JSON_FILE)

        with open(JSON_FILE, 'r') as f:
            dicSet = json.load(f)

        print('Load json {} to table {}'.format(JSON_FILE, tableName))
        tupleList = []
        columnNames = list(dicSet[0].keys())
        # Backtick-quote column names in case they contain spaces or SQL keywords
        columnNamesSqlJoined = ', '.join(
            map(lambda x: '`' + x + '`', columnNames))

        for dic in dicSet:
            tupleList.append(tuple(dic.values()))

        insertSql = "INSERT INTO {} ({}) VALUES(?{});".format(
            tableName, columnNamesSqlJoined, ',?' * (len(tupleList[0]) - 1))

        self.executemany(insertSql, tupleList)
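Unlike the pandas variant in Example #4, this version builds the INSERT statement by hand, assumes every dict shares the same keys in the same order, and appends to a table that must already exist (no DROP or CREATE is issued). A hedged usage sketch with a hypothetical file:

# users.json: [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
db.load_json('users.json', tableName='users')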
Example #6
import os

import cv2
import numpy as np
from matplotlib import pyplot as plt

# predict / predictList come from the model inference step earlier in the script
print(predict)
print(np.argmax(predict))
for no, x in enumerate(predict):
    idx = np.argmax(x)
    print('Confidence: {}%'.format(x[idx] * 100))
    # Map the predicted class back to the first image of that group
    guess_img_name = 'image_{}.jpg'.format(str(idx * 80 + 1).zfill(4))
    guess_img_path = os.path.join(config.DATASET_DIR, guess_img_name)
    guess_img = cv2.imread(guess_img_path)
    guess_img = cv2.cvtColor(guess_img, cv2.COLOR_BGR2RGB)

    predict_img_path = predictList[no]
    print(predict_img_path)
    predict_img = cv2.imread(predict_img_path)
    predict_img = cv2.cvtColor(predict_img, cv2.COLOR_BGR2RGB)

    tag = [
        'predict_img' + ' ' +
        toolkit_file.get_basename(predict_img_path, withExtension=True),
        'guess_img'
    ]
    for j, i in enumerate([predict_img, guess_img]):
        plt.subplot(1, 2, j + 1)
        plt.imshow(i, cmap='gray', vmin=0, vmax=255)
        plt.xlabel(tag[j])
    plt.show()
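The idx * 80 + 1 arithmetic inverts the group mapping from Example #3: group g covers pic_ids g*80+1 through (g+1)*80, so it recovers the first image of the predicted group. A quick sanity check of the round trip:

for pic_id in (1, 80, 81, 161):
    group_id = (pic_id - 1) // 80    # forward mapping from generate_file_list
    first_in_group = group_id * 80 + 1
    print(pic_id, group_id, first_in_group)
# 1 -> group 0, first image 1; 81 -> group 1, first image 81; ...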
Example #7
def batch_generate(font_file):
    basename = toolkit_file.get_basename(font_file)
    for order in char_set:
        generate_image(font_file, chr(order)).save(
            os.path.join(training_data_dir, str(order),
                         '{}_{}.jpg'.format(basename, order)))
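A hedged usage sketch; char_set, training_data_dir, and generate_image are assumed to be defined in the surrounding module, and the font path is hypothetical:

batch_generate('fonts/myfont.ttf')
# For each code point in char_set this writes
# <training_data_dir>/<order>/<basename>_<order>.jpg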
Example #8

if __name__ == '__main__':
    toolkit_file.purge_folder('logs')
    shape = X_dataset.shape
    model = buildModel(shape)

    callback = EarlyStopping(monitor="loss",
                             patience=30,
                             verbose=1,
                             mode="auto")
    tbCallBack = TensorBoard(
        log_dir='./logs',  # log directory
        histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables them
        #                  batch_size=batch_size,     # how much data to use per histogram
        write_graph=True,  # whether to store the graph structure
        write_grads=True,  # whether to visualize gradient histograms
        write_images=True,  # whether to visualize model weights as images
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None)

    model.fit(X_dataset,
              Y_dataset,
              epochs=1000,
              shuffle=True,
              batch_size=batch_size,
              validation_split=0.1,
              callbacks=[callback, tbCallBack])
    model.save(model_name)
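For completeness, a sketch of the imports the callbacks above rely on, assuming standalone Keras (under TensorFlow 2 these would come from tensorflow.keras.callbacks instead):

from keras.callbacks import EarlyStopping, TensorBoard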