import csv

import pandas as pd

import toolkit_file


def load_csv(self, csvFile, tableName=None, delimiter=','):
    '''Load a CSV file into sqlite.'''
    if not tableName:
        tableName = toolkit_file.get_basename(csvFile)
    # Read the header row and normalize column names for SQL.
    with open(csvFile, 'r') as f:
        reader = csv.reader(f, delimiter=delimiter)
        header = next(reader)
    header = [x.strip().replace(' ', '_') for x in header]
    # Recreate the table from scratch.
    drop_SQL = 'DROP TABLE IF EXISTS {}'.format(tableName)
    self.execute(drop_SQL)
    # Stream the file in chunks so large CSVs do not exhaust memory.
    chunks = pd.read_csv(csvFile, chunksize=100000, sep=delimiter,
                         dtype=str, names=header, header=0)
    for chunk in chunks:
        chunk.to_sql(name=tableName, if_exists='append',
                     con=self.conn, index=False)
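# A minimal usage sketch for load_csv. The SqliteDB wrapper below is a
# hypothetical stand-in (its name, conn attribute, and execute helper are
# assumptions, not part of the source); it only provides the pieces
# load_csv relies on.
import sqlite3


class SqliteDB:
    def __init__(self, db_path):
        self.conn = sqlite3.connect(db_path)

    def execute(self, sql):
        self.conn.execute(sql)
        self.conn.commit()


SqliteDB.load_csv = load_csv  # attach the function defined above as a method

db = SqliteDB('demo.db')
# Table name defaults to the CSV basename (via toolkit_file) when omitted.
db.load_csv('cities.csv')
db.load_csv('data.tsv', tableName='data', delimiter='\t')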
import logging

import pandas as pd

import toolkit_file


def extract_info(html):
    '''Extract information from an html file.'''
    infoDict = {
        'phone_number_prefix': int(toolkit_file.get_basename(html)),
        'province': None,
        'city': None,
        'card_type': None,
        'area_code': None,
        'postal_code': None,
    }
    # '验证手机号有误' ("invalid phone number") means the phone_number_prefix
    # is not registered; return the empty record as-is.
    with open(html, encoding='utf-8') as f:
        if '验证手机号有误' in f.read():
            return infoDict
    # The first table on the page holds the result as key/value rows.
    df = pd.read_html(html)[0]
    tmp_dict = dict()
    for index, row in df.iterrows():
        tmp_dict[row.iloc[0]] = row.iloc[1]
    try:
        if isinstance(tmp_dict['卡号归属地'], float):
            # 卡号归属地 (home location) is empty; pandas parsed it as NaN.
            province, city = None, None
        elif isinstance(tmp_dict['卡号归属地'], str) and len(
                tmp_dict['卡号归属地'].split(' ')) == 2:
            # Both province and city are present, e.g. '江苏 南京'.
            province, city = tmp_dict['卡号归属地'].split(' ')
        elif isinstance(tmp_dict['卡号归属地'], str) and len(
                tmp_dict['卡号归属地'].split(' ')) == 1:
            # 卡号归属地 only contains the province.
            province, city = tmp_dict['卡号归属地'].split(' ')[0], None
        infoDict['phone_number_prefix'] = int(
            tmp_dict['手机号码段'].replace('*', ''))
        infoDict['province'] = province
        infoDict['city'] = city
        infoDict['card_type'] = tmp_dict['卡 类 型']
        infoDict['area_code'] = tmp_dict['区 号']
        infoDict['postal_code'] = tmp_dict['邮 编']
    except Exception:
        logging.error(type(tmp_dict['卡号归属地']))
        logging.error('Error when loading {}'.format(html))
        logging.error('Data {}'.format(tmp_dict))
        raise
    return infoDict
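# A short driver sketch for extract_info: walk a folder of saved lookup
# pages and collect one record per file. The 'html_pages/<prefix>.html'
# layout is an assumption for illustration only.
import glob
import os

import pandas as pd

records = [extract_info(page)
           for page in glob.glob(os.path.join('html_pages', '*.html'))]
# Each record is a flat dict, so the batch converts directly to a DataFrame.
df = pd.DataFrame(records)
print(df.head())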
import toolkit_file


def generate_file_list(dataset_dir):
    imgFileList = [
        x for x in toolkit_file.get_file_list(dataset_dir)
        if x.endswith('.jpg')
    ]
    dataset_dict_list = []
    for file in imgFileList:
        # File names follow 'image_NNNN.jpg'; the numeric part is the pic id.
        pic_id = int(
            toolkit_file.get_basename(file, withExtension=False).replace(
                'image_', ''))
        # Every 80 consecutive pictures form one group.
        group_id = (pic_id - 1) // 80
        dataset_dict_list.append({
            'pic_id': pic_id,
            'group_id': group_id,
            'image_path': file
        })
    return dataset_dict_list
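# A worked example of the grouping arithmetic (hypothetical file names,
# following the 'image_NNNN.jpg' pattern the function expects):
# pic_id 1..80 map to group 0, 81..160 to group 1, and so on.
for name in ['image_0001.jpg', 'image_0080.jpg', 'image_0081.jpg']:
    pic_id = int(name.replace('image_', '').replace('.jpg', ''))
    print(name, '-> group', (pic_id - 1) // 80)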
import json

import pandas as pd

import toolkit_file


def load_json(self, JSON_FILE, tableName=None, full_refresh=True):
    '''Load a JSON file into sqlite.'''
    if not tableName:
        tableName = toolkit_file.get_basename(JSON_FILE)
    with open(JSON_FILE, 'r', errors='ignore') as f:
        dicSet = json.load(f)['rows']
    if full_refresh:
        # Rebuild the table from scratch instead of appending.
        drop_SQL = 'DROP TABLE IF EXISTS {}'.format(tableName)
        self.execute(drop_SQL)
    df = pd.DataFrame(dicSet)
    df.to_sql(name=tableName, if_exists='append', con=self.conn,
              index=False, chunksize=20000)
import json

import toolkit_file


def load_json(self, JSON_FILE, tableName=None):
    '''Load a JSON file into sqlite.'''
    if not tableName:
        tableName = toolkit_file.get_basename(JSON_FILE)
    with open(JSON_FILE, 'r') as f:
        dicSet = json.load(f)
    print('Load json {} to table {}'.format(JSON_FILE, tableName))
    # Build one parameter tuple per record, keeping the column order of the
    # first record.
    columnNames = list(dicSet[0].keys())
    columnNamesSqlJoined = ', '.join('`{}`'.format(x) for x in columnNames)
    tupleList = [tuple(dic.values()) for dic in dicSet]
    insertSql = 'INSERT INTO {} ({}) VALUES(?{});'.format(
        tableName, columnNamesSqlJoined, ',?' * (len(tupleList[0]) - 1))
    self.executemany(insertSql, tupleList)
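# The two load_json variants trade convenience for control: the pandas
# version above creates the table automatically via to_sql, while this
# executemany version avoids the pandas dependency but needs the target
# table to exist already. A usage sketch, reusing the hypothetical SqliteDB
# wrapper from the load_csv example plus an assumed executemany helper:
def executemany(self, sql, rows):
    self.conn.executemany(sql, rows)
    self.conn.commit()


SqliteDB.executemany = executemany
SqliteDB.load_json = load_json  # the executemany variant defined above

db = SqliteDB('demo.db')
db.execute('CREATE TABLE IF NOT EXISTS users (`name` TEXT, `age` INTEGER)')
# users.json is assumed to hold a list of flat dicts, e.g.
# [{"name": "a", "age": 1}, ...]
db.load_json('users.json', tableName='users')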
import os

import cv2
import numpy as np
from matplotlib import pyplot as plt

import config
import toolkit_file

print(predict)
print(np.argmax(predict))

for no, x in enumerate(predict):
    idx = np.argmax(x)
    print('Confidence: {}%'.format(x[idx] * 100))
    # The first image of the predicted group serves as the model's guess.
    guess_img = 'image_{}.jpg'.format(str(idx * 80 + 1).zfill(4))
    guess_img_path = os.path.join(config.DATASET_DIR, guess_img)
    guess_img = cv2.imread(guess_img_path)
    guess_img = cv2.cvtColor(guess_img, cv2.COLOR_BGR2RGB)
    predict_img_path = predictList[no]
    print(predict_img_path)
    predict_img = cv2.imread(predict_img_path)
    predict_img = cv2.cvtColor(predict_img, cv2.COLOR_BGR2RGB)
    tag = [
        'predict_img ' + toolkit_file.get_basename(predict_img_path,
                                                   withExtension=True),
        'guess_img'
    ]
    # Show the input image and the guessed group image side by side.
    for j, i in enumerate([predict_img, guess_img]):
        plt.subplot(1, 2, j + 1)
        plt.imshow(i, cmap='gray', vmin=0, vmax=255)
        plt.xlabel(tag[j])
    plt.show()
import os

import toolkit_file


def batch_generate(font_file):
    basename = toolkit_file.get_basename(font_file)
    # Render every character in char_set with this font and save it as
    # training_data_dir/<ordinal>/<font basename>_<ordinal>.jpg
    for order in char_set:
        generate_image(font_file, chr(order)).save(
            os.path.join(training_data_dir, str(order),
                         '{}_{}.jpg'.format(basename, order)))
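# A driver sketch for batch_generate, assuming char_set, training_data_dir,
# and generate_image are defined elsewhere in the repo and that the fonts
# live in a hypothetical 'fonts/' folder. The per-character output folders
# must exist before PIL's save() is called.
import glob
import os

for order in char_set:
    os.makedirs(os.path.join(training_data_dir, str(order)), exist_ok=True)

for font_file in glob.glob(os.path.join('fonts', '*.ttf')):
    batch_generate(font_file)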
from keras.callbacks import EarlyStopping, TensorBoard

import toolkit_file

if __name__ == '__main__':
    toolkit_file.purge_folder('logs')
    shape = X_dataset.shape
    model = buildModel(shape)
    # Stop training once the loss has not improved for 30 epochs.
    callback = EarlyStopping(monitor='loss', patience=30, verbose=1,
                             mode='auto')
    tbCallBack = TensorBoard(
        log_dir='./logs',  # directory for TensorBoard logs
        histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables them
        # batch_size=batch_size,  # how much data to use per histogram computation
        write_graph=True,  # store the model graph
        write_grads=True,  # visualize gradient histograms
        write_images=True,  # visualize the weights as images
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None)
    model.fit(X_dataset,
              Y_dataset,
              epochs=1000,
              shuffle=True,
              batch_size=batch_size,
              validation_split=0.1,
              callbacks=[callback, tbCallBack])
    model.save(model_name)
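# After training, the saved model can be reloaded for inference. A minimal
# sketch, assuming standalone Keras (matching the TensorBoard arguments
# above) and reusing X_dataset purely as example input:
from keras.models import load_model

model = load_model(model_name)
predict = model.predict(X_dataset[:1])
print(predict)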