def loadFnBtn(self):
    """Create the title-bar function buttons and add them to ``self.layout``.

    The buttons are added in this order: new tab, minimize,
    maximize/restore, close.  ``self.setHeight()`` is called at the end to
    apply the initial height.
    """
    # The window-control icons are drawn with the Webdings font.
    font = self.font() or QFont()
    font.setFamily('Webdings')

    # "Add new tab" button.  The glyph is written as a literal escape rather
    # than unichr(0xf067): unichr() only exists on Python 2, while the
    # literal yields the same one-character string on Python 2 and 3.
    self.buttonAddPage = QPushButton(
        u'\uf067', self,
        clicked=self.addPaged.emit,
        font=fontawesome("far"),
        objectName='buttonAddPage')
    self.layout.addWidget(self.buttonAddPage)

    # Minimize button
    self.buttonMinimum = QPushButton(
        '0', self,
        clicked=self.windowMinimumed.emit,
        font=font,
        objectName='buttonMinimum')
    self.layout.addWidget(self.buttonMinimum)

    # Maximize / restore button
    # NOTE(review): unlike its siblings this button calls self.showMaximized
    # directly instead of emitting a window signal -- confirm this is intended.
    self.buttonMaximum = QPushButton(
        '1', self,
        clicked=self.showMaximized,
        font=font,
        objectName='buttonMaximum')
    self.layout.addWidget(self.buttonMaximum)

    # Close button
    self.buttonClose = QPushButton(
        'r', self,
        clicked=self.windowClosed.emit,
        font=font,
        objectName='buttonClose')
    self.layout.addWidget(self.buttonClose)

    # Initial height
    self.setHeight()
def fullToHalfText(ustring):
    """Convert full-width characters in ``ustring`` to half-width.

    The ideographic space (U+3000) becomes an ASCII space and the full-width
    forms U+FF01..U+FF5E are shifted down by 0xFEE0 onto ASCII 0x21..0x7E.
    Every other character is returned unchanged.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr(); chr() covers all code points

    converted = []
    for uchar in ustring:
        code = ord(uchar)
        if code == 0x3000:
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:
            code -= 0xFEE0
        converted.append(to_char(code))
    # Join once instead of growing the string with += on every character.
    return "".join(converted)
def quanToBan(quan_String):
    """Convert full-width characters in ``quan_String`` to half-width.

    The full-width space (U+3000) maps directly to an ASCII space; the other
    full-width forms (U+FF01..U+FF5E) map to ASCII by subtracting 0xFEE0.
    Characters outside those ranges pass through untouched.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr(); chr() covers all code points

    def to_half(uchar):
        code = ord(uchar)
        if code == 0x3000:  # full-width space converts directly
            return to_char(0x20)
        if 0xFF01 <= code <= 0xFF5E:  # other full-width forms: fixed offset
            return to_char(code - 0xFEE0)
        return to_char(code)

    # Join once instead of growing the string with += on every character.
    return "".join([to_half(uchar) for uchar in quan_String])
def strQ2B(ustring):
    """Convert full-width characters to half-width.

    U+3000 (full-width space) becomes U+0020; U+FF01..U+FF5E are shifted down
    by 0xFEE0 onto their ASCII counterparts.  All other characters are kept.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr(); chr() covers all code points

    pieces = []
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 0x3000:
            # Full-width space converts directly.
            inside_code = 0x20
        elif 0xFF01 <= inside_code <= 0xFF5E:
            # Remaining full-width forms differ from ASCII by a fixed offset.
            inside_code -= 0xFEE0
        pieces.append(to_char(inside_code))
    # Join once instead of growing the string with += on every character.
    return "".join(pieces)
def strB2Q(ustring):
    """Convert half-width characters to full-width.

    An ASCII space becomes the full-width space U+3000; the printable ASCII
    characters 0x21..0x7E are shifted up by 0xFEE0 to U+FF01..U+FF5E.  All
    other characters are kept unchanged.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3 has no unichr(); chr() covers all code points

    pieces = []
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 0x20:
            # Half-width space converts directly.
            inside_code = 0x3000
        elif 0x21 <= inside_code <= 0x7E:
            # Remaining printable ASCII differs from full-width by a fixed
            # offset (the space was already handled above).
            inside_code += 0xFEE0
        pieces.append(to_char(inside_code))
    # Join once instead of growing the string with += on every character.
    return "".join(pieces)
def entityref(self, c):
    """Return the replacement text for the named entity ``c``.

    When ``self.unicode_snob`` is false and ``c`` is a key of ``unifiable``,
    the mapped replacement is returned.  Otherwise the name is resolved with
    ``name2cp`` and the corresponding character is returned; if ``name2cp``
    raises ``KeyError`` the reference is returned as ``"&" + c + ";"``.
    """
    # Membership is tested on the mapping itself; ``.keys()`` would build a
    # throwaway list on Python 2.
    if not self.unicode_snob and c in unifiable:
        return unifiable[c]

    # Resolve the name once instead of once to probe and once to convert.
    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c + ';'

    try:
        return unichr(codepoint)
    except NameError:  # Python 3
        return chr(codepoint)
def charref(self, name):
    """Return the character for a numeric character reference.

    ``name`` is the reference body: decimal digits, or ``x``/``X`` followed
    by hexadecimal digits.  When ``self.unicode_snob`` is false and the code
    point is a key of ``unifiable_n``, the mapped replacement is returned.
    A code point that cannot be turned into a character yields ``''``.

    Raises:
        ValueError: if the digits in ``name`` cannot be parsed as an integer.
    """
    if name[0] in ('x', 'X'):
        c = int(name[1:], 16)
    else:
        c = int(name)

    if not self.unicode_snob and c in unifiable_n:
        return unifiable_n[c]

    try:
        to_char = unichr  # Python 2
    except NameError:  # Python 3
        to_char = chr

    try:
        return to_char(c)
    except (ValueError, OverflowError):
        # The reference names a code point outside the valid range; drop it
        # instead of letting the error escape from the parser.
        return ''
def close(self):
    """Finish parsing and return the accumulated output text.

    Closes the underlying ``HTMLParser``, flushes pending output through
    ``self.pbr()`` and ``self.o('', 0, 'end')``, joins ``self.outtextlist``
    into ``self.outtext``, and finally substitutes the non-breaking-space
    placeholder.
    """
    HTMLParser.HTMLParser.close(self)

    self.pbr()
    self.o('', 0, 'end')

    self.outtext = self.outtext.join(self.outtextlist)

    if self.unicode_snob:
        nbsp_codepoint = name2cp('nbsp')
        # Same Python 2/3 fallback that entityref() and charref() use.
        try:
            nbsp = unichr(nbsp_codepoint)
        except NameError:  # Python 3
            nbsp = chr(nbsp_codepoint)
    else:
        nbsp = u' '
    # NOTE(review): the placeholder literal is reproduced as it appears in the
    # reviewed text -- confirm it matches the marker emitted elsewhere.
    self.outtext = self.outtext.replace(u' _place_holder;', nbsp)

    return self.outtext
def rightMenuShow(self):
    """Build the context menu and pop it up at the current cursor position.

    The menu holds a single widget action wrapping a "home" push button.
    Any exception raised while building or showing the menu is printed and
    otherwise ignored.
    """
    try:
        self.contextMenu = QMenu()
        self.contextMenu.setFont(fontawesome("far"))

        # QWidgetAction's constructor takes a mandatory parent; calling it
        # without one raises TypeError, which the handler below swallowed, so
        # the menu was never shown.  The menu owns the action.
        index_action = QWidgetAction(self.contextMenu)
        # Literal escape instead of unichr(0xf015): unichr() is Python 2 only.
        index_button = QPushButton(
            u'\uf015',
            # clicked=self.zoom_out_func,
            font=fontawesome("far"),
        )
        index_button.setToolTip("主页")
        index_button.setCursor(Qt.ArrowCursor)
        index_action.setDefaultWidget(index_button)

        # NOTE(review): verify what addAction() returns for a QAction argument
        # in the binding in use before relying on self.actionA.
        self.actionA = self.contextMenu.addAction(index_action)

        # Position at which the menu is shown.
        self.contextMenu.popup(QCursor.pos())
        # self.actionA.triggered.connect(self.actionHandler)
        self.contextMenu.show()
    except Exception as e:
        # Best-effort UI handler: report the problem but keep the app running.
        print(e)
"b": 1, "c": 2, "d": 3, "e": 4, "f": 5, "g": 6, "h": 7 } template = \ """| {} """ for i in range(8): for j in range(8): board[i].append(template.format(" ")) whiteFigures = { "WQ": u''.join(unichr(9813)), "WB": u''.join(unichr(9815)), "WKn": u''.join(unichr(9816)), "WR": u''.join(unichr(9814)), "WK": u''.join(unichr(9812)), "WP": u''.join(unichr(9817)), } blackFigures = { "BK": u''.join(unichr(9818)), "BQ": u''.join(unichr(9819)), "BB": u''.join(unichr(9821)), "BKn": u''.join(unichr(9822)), "BR": u''.join(unichr(9820)), "BP": u''.join(unichr(9823)) }
def __init__(self):
    """Build the full-screen window: labels, fonts, menu actions and data.

    Creates every ``QLabel`` used by the train, date/time, weather, forecast
    and calendar sections, the fonts and colour style strings, a hidden menu
    with keyboard shortcuts (Ctrl+Q, Ctrl+X, Ctrl+A), switches to full
    screen, instantiates ``TrainData`` and ``WeatherData`` and finally runs
    the ``init*``/``set*`` methods.
    """
    super(Window, self).__init__()
    self.setWindowTitle("Smart mirror")
    self.setAutoFillBackground(True)
    p = self.palette()
    p.setColor(self.backgroundRole(), Qt.black)
    # Degree sign (U+00B0).  Written as a literal because unichr() exists
    # only on Python 2; the value is identical.
    self.degree = u'\xb0'
    self.redrawCount = 0
    self.calendarRefreshCount = 0
    self.setPalette(p)

    # Train labels.  Child widgets are created in the same order as before.
    self.prevTrainLabel = QLabel(self)
    self.prevTrainLabel1 = QLabel(self)
    self.prevTrainLabel2 = QLabel(self)
    self.prevTrainLabel3 = QLabel(self)
    self.nextTrainLabel = QLabel(self)
    self.nextTrainLabel1 = QLabel(self)
    self.nextTrainLabel2 = QLabel(self)
    self.nextTrainLabel3 = QLabel(self)
    self.nextTrain1Label = QLabel(self)
    self.nextTrain1Label1 = QLabel(self)
    self.nextTrain1Label2 = QLabel(self)
    self.nextTrain1Label3 = QLabel(self)
    self.nextTrain2Label = QLabel(self)
    self.nextTrain2Label1 = QLabel(self)
    self.nextTrain2Label2 = QLabel(self)
    self.nextTrain2Label3 = QLabel(self)
    self.startTime = QLabel(self)
    self.lateTime = QLabel(self)
    self.arrivalTime = QLabel(self)
    self.arrivalStation = QLabel(self)
    self.line = QLabel(self)

    # Date/time and current-weather labels.
    self.time = QLabel(self)
    self.date = QLabel(self)
    self.temperature = QLabel(self)
    self.weather = [QLabel(self), QLabel(self), QLabel(self)]
    self.clouds = QLabel(self)
    self.wind = QLabel(self)

    # Forecast labels.
    self.weatherForecastWeek = []
    self.forecastSeparatorLine = []
    self.forecastSeparatorPartOfTheDay = []
    self.forecastDayIdentify = []
    for _ in range(4):
        # The two lists are filled alternately on purpose: this keeps the
        # original widget creation order.
        self.forecastSeparatorLine.append(QLabel(self))
        self.forecastSeparatorPartOfTheDay.append(QLabel(self))
    for _ in range(5):
        self.forecastDayIdentify.append(QLabel(self))
    for _ in range(5 * forecastRows):
        self.weatherForecastWeek.append(QLabel(self))

    # Calendar labels.
    self.calendarEntries = []
    for _ in range(15):
        self.calendarEntries.append(QLabel(self))

    # Fonts.
    self.trainLabelFont = QtGui.QFont("Times", 35, QtGui.QFont.Normal)
    self.dayLabelFont = QtGui.QFont("Times", 30, QtGui.QFont.Normal)
    self.timeLabelFont = QtGui.QFont("Times", 150, QtGui.QFont.Normal)
    self.tempLabelFont = QtGui.QFont("Times", 100, QtGui.QFont.Normal)
    self.dateLabelFont = QtGui.QFont("Times", 45, QtGui.QFont.Normal)
    self.forecastLabelFont = QtGui.QFont("Times", 20, QtGui.QFont.Normal)
    self.forecastLabelFontBold = QtGui.QFont("Times", 20, QtGui.QFont.Bold)

    # Untitled actions that exist only for their keyboard shortcuts.
    exit_action = QAction("", self)
    exit_action.setShortcut("Ctrl+Q")
    exit_action.setStatusTip('')
    exit_action.triggered.connect(self.showNormalAndExit)

    exit_full_screen = QAction("", self)
    exit_full_screen.setShortcut("Ctrl+X")
    exit_full_screen.setStatusTip('')
    exit_full_screen.triggered.connect(self.showNormal)

    return_full_screen = QAction("", self)
    return_full_screen.setShortcut("Ctrl+A")
    return_full_screen.setStatusTip('')
    return_full_screen.triggered.connect(self.showFullScreen)

    # Style-sheet snippets.
    self.colorWhite = 'color: white'
    self.colorGrey = 'color: grey'
    self.colorRed = 'color: red'
    self.colorDarkGrey = 'color: #1e1e1e'

    self.statusBar()

    # Black-on-black menu styling; the menu is also resized to 0x0 below.
    self.setStyleSheet(""" QMenuBar { background-color: rgb(0,0,0); color: rgb(255,255,255); border: 1px solid #000; } QMenuBar::item { background-color: rgb(0,0,0); color: rgb(255,255,255); } QMenuBar::item::selected { background-color: rgb(0,0,0); } QMenu { background-color: rgb(0,0,0); color: rgb(255,255,255); border: 1px solid #000; } QMenu::item::selected { background-color: rgb(0,0,0); } """)

    mainMenu = self.menuBar()
    fileMenu = mainMenu.addMenu('')
    fileMenu.addAction(exit_action)
    fileMenu.addAction(exit_full_screen)
    fileMenu.addAction(return_full_screen)
    mainMenu.resize(0, 0)

    self.showFullScreen()

    # Data sources.
    self.trainData = TrainData()
    self.weatherData = WeatherData()

    # Initialise the label layout, then fill in the first values.
    self.setTrainTexts()
    self.initTrainsLabel()
    self.initDateTimeLabels()
    self.initWeatherLabels()
    self.initForecastWeatherLabels()
    self.initCalendarLabels()
    self.setTrains()
    self.setDateTime()
    self.setWeather()
    self.setForecastWeather()
    self.setCalendar()
def initToolbar(self, webview):
    """Create the navigation toolbar and wire it to ``webview``.

    A ``QToolBar`` is built from ``QWidgetAction`` objects that wrap push
    buttons (back, forward, reload, home, "more") plus a URL line edit.
    Zoom-in, zoom-out, zoom-rate label and full-screen widgets are created
    as attributes as well, but this method does not add them to the toolbar.
    Finally the web view's signals are connected to the window's handlers.

    All icon glyphs are written as literal escapes: ``unichr()`` exists only
    on Python 2, while the literals give the same characters on 2 and 3.
    """
    # Add the navigation bar.
    self.navigation_bar = QToolBar('Navigation')
    # Lock the navigation bar in place.
    self.navigation_bar.setMovable(False)
    # Set the icon size (overridden again at the end of this method).
    self.navigation_bar.setIconSize(QSize(2, 2))
    # Attach the navigation bar to the window.
    self.addToolBar(self.navigation_bar)
    # Further configuration.
    self.navigation_bar.setObjectName("navigation_bar")
    self.navigation_bar.setCursor(Qt.ArrowCursor)

    # Glyphs used for the reload and stop states.
    self.reload_icon = u'\uf2f9'
    self.stop_icon = u'\uf00d'

    # Back button
    self.back_action = QWidgetAction(self)
    self.back_button = QPushButton(
        u'\uf060', self,
        clicked=webview.back,
        font=fontawesome("far"),
        objectName='back_button')
    self.back_button.setToolTip("后退")
    self.back_button.setCursor(Qt.ArrowCursor)
    self.back_action.setDefaultWidget(self.back_button)

    # Forward button
    self.next_action = QWidgetAction(self)
    self.next_button = QPushButton(
        u'\uf061', self,
        clicked=webview.forward,
        font=fontawesome("far"),
        objectName='next_button')
    self.next_button.setToolTip("前进")
    self.next_button.setCursor(Qt.ArrowCursor)
    self.next_action.setDefaultWidget(self.next_button)

    # Reload / stop button
    self.reload_action = QWidgetAction(self)
    self.reload_button = QPushButton(
        self.reload_icon, self,
        clicked=webview.reload,
        font=fontawesome("far"),
        objectName='reload_button')
    self.reload_button.setToolTip("刷新")
    self.reload_button.setCursor(Qt.ArrowCursor)
    self.reload_action.setDefaultWidget(self.reload_button)

    # Zoom-in button
    self.zoom_in_button = QPushButton(
        u'\uf067', self,
        clicked=self.zoom_in_func,
        font=fontawesome("far"),
        objectName='zoom_in_btn')
    self.zoom_in_button.setToolTip("放大")
    self.zoom_in_button.setCursor(Qt.ArrowCursor)

    # Zoom-out button
    self.zoom_out_button = QPushButton(
        u'\uf068', self,
        clicked=self.zoom_out_func,
        font=fontawesome("far"),
        objectName='zoom_out_btn')
    self.zoom_out_button.setToolTip("缩小")
    self.zoom_out_button.setCursor(Qt.ArrowCursor)

    # Label showing the current zoom rate as a percentage.
    self.sf_label_rate = QLabel()
    self.sf_label_rate.setObjectName("sf_label_rate")
    self.sf_label_rate.setFixedWidth(30)
    self.sf_label_rate.setAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
    self.sf_label_rate.setProperty("class", "qlabel")
    # NOTE(review): this reads self.webview rather than the ``webview``
    # argument -- confirm both refer to the same view.
    self.sf_label_rate.setText(str(int(self.webview.zoomFactor() * 100)) + "%")

    # Full-screen button
    self.full_screen_button = QPushButton(
        u'\ue140', self,
        clicked=self.full_screen_func,
        font=fontawesome("boot"),
        objectName='full_screen_button')
    self.full_screen_button.setToolTip("全屏")
    self.full_screen_button.setCursor(Qt.ArrowCursor)

    # "More" button
    self.more_action = QWidgetAction(self)
    self.more_button = QPushButton(
        u'\ue235', self,
        clicked=self.moreMenuShow,
        font=fontawesome("boot"),
        objectName='more_button')
    self.more_button.setToolTip("页面控制及浏览器核心")
    self.more_button.setCursor(Qt.ArrowCursor)
    self.more_action.setDefaultWidget(self.more_button)

    # Home button (no click handler is connected here).
    self.index_action = QWidgetAction(self)
    self.index_button = QPushButton(
        u'\uf015', self,
        font=fontawesome("far"),
        objectName='index_button')
    self.index_button.setToolTip("主页")
    self.index_button.setCursor(Qt.ArrowCursor)
    self.index_action.setDefaultWidget(self.index_button)

    # Put the buttons on the navigation bar.
    self.navigation_bar.addAction(self.back_action)
    self.navigation_bar.addAction(self.next_action)
    self.navigation_bar.addAction(self.reload_action)
    self.navigation_bar.addAction(self.index_action)

    # URL address bar; pressing Return navigates to the typed URL.
    self.urlbar = QLineEdit()
    self.urlbar.returnPressed.connect(self.navigate_to_url)
    self.navigation_bar.addWidget(self.urlbar)

    self.navigation_bar.addAction(self.more_action)

    # Keep the window in sync with the web view's state changes.
    webview.urlChanged.connect(self.renew_urlbar)
    webview.loadProgress.connect(self.processLoad)
    webview.loadStarted.connect(self.loadPage)
    webview.loadFinished.connect(self.loadFinish)
    webview.titleChanged.connect(self.renew_title)
    webview.iconChanged.connect(self.renew_icon)
    self.webBind()
    webview.show()

    self.navigation_bar.setIconSize(QSize(20, 20))
    self.urlbar.setFont(QFont('SansSerif', 13))
class Configuration(object):
    """Plain attribute bag for the model and training hyper-parameters."""


CONFIG = Configuration()
CONFIG.input_layers = 2
CONFIG.output_layers = 2
CONFIG.amount_of_dropout = 0.2
CONFIG.hidden_size = 500
CONFIG.initialization = "he_normal"
CONFIG.number_of_chars = 100
CONFIG.max_input_len = 60
CONFIG.inverted = True

# parameters for the training:
CONFIG.batch_size = 100  # As the model changes in size, play with the batch size to best fit the process in memory
CONFIG.epochs = 500  # due to mini-epochs.
CONFIG.steps_per_epoch = 1000  # This is a mini-epoch. Using News 2013 an epoch would need to be ~60K.
CONFIG.validation_steps = 10
CONFIG.number_of_iterations = 10
#pylint:enable=attribute-defined-outside-init

# Hash of the configuration.  The JSON text is encoded explicitly because
# hashlib only accepts bytes on Python 3 (the dump is pure ASCII, so the
# digest is the same on Python 2).
DIGEST = sha256(json.dumps(CONFIG.__dict__, sort_keys=True).encode("utf-8")).hexdigest()

# Parameters for the dataset
MIN_INPUT_LEN = 5
AMOUNT_OF_NOISE = 0.2 / CONFIG.max_input_len
CHARS = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .")
PADDING = "☕"

DATA_FILES_PATH = "~/Downloads/data"
DATA_FILES_FULL_PATH = os.path.expanduser(DATA_FILES_PATH)
DATA_FILES_URL = "http://www.statmt.org/wmt14/training-monolingual-news-crawl/news.2013.en.shuffled.gz"
NEWS_FILE_NAME_COMPRESSED = os.path.join(DATA_FILES_FULL_PATH, "news.2013.en.shuffled.gz")  # 1.1 GB
NEWS_FILE_NAME_ENGLISH = "news.2013.en.shuffled"
NEWS_FILE_NAME = os.path.join(DATA_FILES_FULL_PATH, NEWS_FILE_NAME_ENGLISH)
NEWS_FILE_NAME_CLEAN = os.path.join(DATA_FILES_FULL_PATH, "news.2013.en.clean")
NEWS_FILE_NAME_FILTERED = os.path.join(DATA_FILES_FULL_PATH, "news.2013.en.filtered")
NEWS_FILE_NAME_SPLIT = os.path.join(DATA_FILES_FULL_PATH, "news.2013.en.split")
NEWS_FILE_NAME_TRAIN = os.path.join(DATA_FILES_FULL_PATH, "news.2013.en.train")
NEWS_FILE_NAME_VALIDATE = os.path.join(DATA_FILES_FULL_PATH, "news.2013.en.validate")
CHAR_FREQUENCY_FILE_NAME = os.path.join(DATA_FILES_FULL_PATH, "char_frequency.json")
SAVED_MODEL_FILE_NAME = os.path.join(DATA_FILES_FULL_PATH, "keras_spell_e{}.h5")  # an HDF5 file

# Some cleanup:
NORMALIZE_WHITESPACE_REGEX = re.compile(r'[^\S\n]+', re.UNICODE)  # match all whitespace except newlines
# NOTE(review): the last member of this class (and of the two parenthesis
# classes below) duplicates an earlier one; it may originally have been a
# full-width variant lost to text normalization -- confirm against VCS.
RE_DASH_FILTER = re.compile(r'[\-\˗\֊\‐\‑\‒\–\—\⁻\₋\−\﹣\-]', re.UNICODE)
# NOTE(review): the literal in the reviewed text was not valid Python (an
# unescaped quote ended the string early).  It is reconstructed here with the
# HTML apostrophe entity as the first alternative -- confirm against VCS.
RE_APOSTROPHE_FILTER = re.compile(
    r"&#39;|[ʼ՚'‘’‛❛❜ߴߵ`‵´ˊˋ{}{}{}{}{}{}{}{}{}]".format(
        unichr(768), unichr(769), unichr(832), unichr(833), unichr(2387),
        unichr(5151), unichr(5152), unichr(65344), unichr(8242)),
    re.UNICODE)
RE_LEFT_PARENTH_FILTER = re.compile(r'[\(\[\{\⁽\₍\❨\❪\﹙\(]', re.UNICODE)
RE_RIGHT_PARENTH_FILTER = re.compile(r'[\)\]\}\⁾\₎\❩\❫\﹚\)]', re.UNICODE)
ALLOWED_CURRENCIES = """¥£₪$€฿₨"""
ALLOWED_PUNCTUATION = """-!?/;"'%&<>.()[]{}@#:,|=*"""
RE_BASIC_CLEANER = re.compile(
    r'[^\w\s{}{}]'.format(re.escape(ALLOWED_CURRENCIES), re.escape(ALLOWED_PUNCTUATION)),
    re.UNICODE)

# pylint:disable=invalid-name


def download_the_news_data():
    """Download the compressed news corpus to NEWS_FILE_NAME_COMPRESSED.

    A rough progress bar is printed when the server reports the content
    length.  HTTP error responses raise instead of being saved as data.
    """
    LOGGER.info("Downloading")
    try:
        os.makedirs(os.path.dirname(NEWS_FILE_NAME_COMPRESSED))
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
    response = requests.get(DATA_FILES_URL, stream=True)
    # Fail early: otherwise an error page would be written to the .gz file.
    response.raise_for_status()
    # The header may be absent; int(None) used to crash here.
    total_length = int(response.headers.get('content-length') or 0)
    downloaded = percentage = 0
    print("»"*100)
    with open(NEWS_FILE_NAME_COMPRESSED, "wb") as output_file:
        for data in response.iter_content(chunk_size=4096):
            downloaded += len(data)
            output_file.write(data)
            if not total_length:
                continue  # size unknown: no progress to report
            new_percentage = 100 * downloaded // total_length
            if new_percentage > percentage:
                print("☑", end="")
                percentage = new_percentage
    print()


def uncompress_data():
    """Uncompress the downloaded archive next to it (same name minus '.gz')."""
    import gzip
    import shutil
    with gzip.open(NEWS_FILE_NAME_COMPRESSED, 'rb') as compressed_file:
        with open(NEWS_FILE_NAME_COMPRESSED[:-3], 'wb') as outfile:
            # Stream in chunks instead of reading the whole file into memory.
            shutil.copyfileobj(compressed_file, outfile)


def add_noise_to_string(a_string, amount_of_noise):
    """Add some artificial spelling mistakes to the string.

    Up to four edits are applied in turn, each with probability
    ``amount_of_noise * len(a_string)``: replace a character, delete a
    character, insert a character (only while the string is shorter than
    CONFIG.max_input_len) and transpose two adjacent characters.
    """
    if rand() < amount_of_noise * len(a_string):
        # Replace a character with a random character
        random_char_position = random_randint(len(a_string))
        a_string = a_string[:random_char_position] + random_choice(CHARS[:-1]) + a_string[random_char_position + 1:]
    if rand() < amount_of_noise * len(a_string):
        # Delete a character
        random_char_position = random_randint(len(a_string))
        a_string = a_string[:random_char_position] + a_string[random_char_position + 1:]
    if len(a_string) < CONFIG.max_input_len and rand() < amount_of_noise * len(a_string):
        # Add a random character
        random_char_position = random_randint(len(a_string))
        a_string = a_string[:random_char_position] + random_choice(CHARS[:-1]) + a_string[random_char_position:]
    # rand() is still drawn first so the random stream is unchanged; the
    # length guard avoids asking for a position in an empty range when fewer
    # than two characters are left.
    if rand() < amount_of_noise * len(a_string) and len(a_string) > 1:
        # Transpose 2 characters
        random_char_position = random_randint(len(a_string) - 1)
        a_string = (a_string[:random_char_position] + a_string[random_char_position + 1] +
                    a_string[random_char_position] + a_string[random_char_position + 2:])
    return a_string


def _vectorize(questions, answers, ctable):
    """One-hot encode questions and answers into two boolean arrays.

    Both input lists are consumed: sentences are popped from the end, so the
    lists are empty afterwards and row ``i`` holds the i-th sentence counted
    from the end.  Characters missing from ``ctable`` leave an all-zero row.
    """
    len_of_questions = len(questions)
    # ``bool`` replaces the ``np.bool`` alias, which newer NumPy removed.
    X = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size), dtype=bool)
    for i in xrange(len(questions)):
        sentence = questions.pop()
        for j, c in enumerate(sentence):
            try:
                X[i, j, ctable.char_indices[c]] = 1
            except KeyError:
                pass  # Padding
    y = np_zeros((len_of_questions, CONFIG.max_input_len, ctable.size), dtype=bool)
    for i in xrange(len(answers)):
        sentence = answers.pop()
        for j, c in enumerate(sentence):
            try:
                y[i, j, ctable.char_indices[c]] = 1
            except KeyError:
                pass  # Padding
    return X, y


def slice_X(X, start=None, stop=None):
    """Slice an array-like, or every array-like in a list.

    Returns ``X[start:stop]`` when X is a single array-like and
    ``[x[start:stop] for x in X]`` when X is a list.  ``start`` may also be a
    list/array of indices (``slice_X(x, indices)``), in which case ``stop``
    should be None.

    # Arguments
        start: integer start index, or a list/array of indices
        stop: integer stop index; should be None if `start` was a list.
    """
    if hasattr(start, '__len__'):
        # hdf5 datasets only support list objects as indices
        if hasattr(start, 'shape'):
            start = start.tolist()
        if isinstance(X, list):
            return [x[start] for x in X]
        return X[start]
    if isinstance(X, list):
        return [x[start:stop] for x in X]
    return X[start:stop]


def vectorize(questions, answers, chars=None):
    """Vectorize the questions and expected answers and split off validation data."""
    print('Vectorization...')
    chars = chars or CHARS
    ctable = CharacterTable(chars)
    X, y = _vectorize(questions, answers, ctable)
    # Explicitly set apart 10% for validation data that we never train over
    split_at = int(len(X) - len(X) / 10)
    (X_train, X_val) = (slice_X(X, 0, split_at), slice_X(X, split_at))
    (y_train, y_val) = (y[:split_at], y[split_at:])

    print(X_train.shape)
    print(y_train.shape)

    return X_train, X_val, y_train, y_val, CONFIG.max_input_len, ctable


def generate_model(output_len, chars=None):
    """Build and compile the encoder/decoder LSTM model."""
    print('Build model...')
    chars = chars or CHARS
    model = Sequential()
    # "Encode" the input sequence using an RNN, producing an output of hidden_size
    # note: in a situation where your input sequences have a variable length,
    # use input_shape=(None, nb_feature).
    for layer_number in range(CONFIG.input_layers):
        model.add(recurrent.LSTM(CONFIG.hidden_size, input_shape=(None, len(chars)),
                                 kernel_initializer=CONFIG.initialization,
                                 return_sequences=layer_number + 1 < CONFIG.input_layers))
        model.add(Dropout(CONFIG.amount_of_dropout))
    # For the decoder's input, we repeat the encoded input for each time step
    model.add(RepeatVector(output_len))
    # The decoder RNN could be multiple layers stacked or a single layer
    for _ in range(CONFIG.output_layers):
        model.add(recurrent.LSTM(CONFIG.hidden_size, return_sequences=True,
                                 kernel_initializer=CONFIG.initialization))
        model.add(Dropout(CONFIG.amount_of_dropout))

    # For each of step of the output sequence, decide which character should be chosen
    model.add(TimeDistributed(Dense(len(chars), kernel_initializer=CONFIG.initialization)))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


class Colors(object):
    """ANSI escape sequences for nicer printouts."""
    green = '\033[92m'
    red = '\033[91m'
    close = '\033[0m'


class CharacterTable(object):
    """
    Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        self.chars = sorted(set(chars))
        self.char_indices = {c: i for i, c in enumerate(self.chars)}
        self.indices_char = {i: c for i, c in enumerate(self.chars)}

    @property
    def size(self):
        """The number of chars"""
        return len(self.chars)

    def encode(self, C, maxlen):
        """Encode the string C as a (maxlen, size) one-hot boolean array."""
        # ``bool`` replaces the ``np.bool`` alias, which newer NumPy removed.
        X = np_zeros((maxlen, len(self.chars)), dtype=bool)
        for i, c in enumerate(C):
            X[i, self.char_indices[c]] = 1
        return X

    def decode(self, X, calc_argmax=True):
        """Decode from one-hot (or from indices when calc_argmax is False)."""
        if calc_argmax:
            X = X.argmax(axis=-1)
        # NOTE(review): ``if x`` also drops the character stored at index 0
        # (the first one in sort order) -- confirm that this is intended.
        return ''.join(self.indices_char[x] for x in X if x)


def generator(file_name):
    """Yield (inputs, targets) batches read from ``file_name`` forever.

    Every batch holds CONFIG.batch_size samples.  The file is re-read from
    the start each time it is exhausted, so the generator never terminates.
    """
    ctable = CharacterTable(read_top_chars())
    batch_of_answers = []
    while True:
        with open(file_name) as answers:
            for raw_answer in answers:
                batch_of_answers.append(raw_answer.strip().decode('utf-8'))
                if len(batch_of_answers) == CONFIG.batch_size:
                    random_shuffle(batch_of_answers)
                    batch_of_questions = []
                    for answer_index, answer in enumerate(batch_of_answers):
                        question, answer = generate_question(answer)
                        batch_of_answers[answer_index] = answer
                        assert len(answer) == CONFIG.max_input_len
                        question = question[::-1] if CONFIG.inverted else question
                        batch_of_questions.append(question)
                    X, y = _vectorize(batch_of_questions, batch_of_answers, ctable)
                    yield X, y
                    batch_of_answers = []


def print_random_predictions(model, ctable, X_val, y_val):
    """Select 10 samples from the validation set at random so we can visualize errors"""
    print()
    for _ in range(10):
        ind = random_randint(0, len(X_val))
        rowX, rowy = X_val[np.array([ind])], y_val[np.array([ind])]  # pylint:disable=no-member
        preds = model.predict_classes(rowX, verbose=0)
        q = ctable.decode(rowX[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        if CONFIG.inverted:
            print('Q', q[::-1])  # inverted back!
        else:
            print('Q', q)
        print('A', correct)
        print(Colors.green + '☑' + Colors.close if correct == guess else Colors.red + '☒' + Colors.close, guess)
        print('---')
    print()


class OnEpochEndCallback(Callback):
    """Execute this every end of epoch"""

    def on_epoch_end(self, epoch, logs=None):
        """Print sample predictions and save the model for this epoch."""
        ctable = CharacterTable(read_top_chars())
        X_val, y_val = next(generator(NEWS_FILE_NAME_VALIDATE))
        print_random_predictions(self.model, ctable, X_val, y_val)
        self.model.save(SAVED_MODEL_FILE_NAME.format(epoch))


ON_EPOCH_END_CALLBACK = OnEpochEndCallback()


def itarative_train(model):
    """
    Iterative training of the model
     - To allow for finite RAM...
     - To allow infinite training data as the training noise is injected in runtime
    """
    model.fit_generator(generator(NEWS_FILE_NAME_TRAIN), steps_per_epoch=CONFIG.steps_per_epoch,
                        epochs=CONFIG.epochs,
                        verbose=1, callbacks=[ON_EPOCH_END_CALLBACK, ],
                        validation_data=generator(NEWS_FILE_NAME_VALIDATE),
                        validation_steps=CONFIG.validation_steps,
                        class_weight=None, max_q_size=10, workers=1,
                        pickle_safe=False, initial_epoch=0)


def iterate_training(model, X_train, y_train, X_val, y_val, ctable):
    """Iterative Training"""
    # Train the model each generation and show predictions against the validation dataset
    for iteration in range(1, CONFIG.number_of_iterations):
        print()
        print('-' * 50)
        print('Iteration', iteration)
        model.fit(X_train, y_train, batch_size=CONFIG.batch_size, epochs=CONFIG.epochs,
                  validation_data=(X_val, y_val))
        print_random_predictions(model, ctable, X_val, y_val)


def clean_text(text):
    """Clean the text - remove unwanted chars, fold punctuation etc."""
    result = NORMALIZE_WHITESPACE_REGEX.sub(' ', text.strip())
    result = RE_DASH_FILTER.sub('-', result)
    result = RE_APOSTROPHE_FILTER.sub("'", result)
    result = RE_LEFT_PARENTH_FILTER.sub("(", result)
    result = RE_RIGHT_PARENTH_FILTER.sub(")", result)
    result = RE_BASIC_CLEANER.sub('', result)
    return result


def preprocesses_data_clean():
    """Pre-process the data - step 1 - cleanup"""
    # Both files are opened in ``with`` blocks so the handles are closed.
    with open(NEWS_FILE_NAME_CLEAN, "wb") as clean_data:
        with open(NEWS_FILE_NAME) as raw_data:
            for line in raw_data:
                decoded_line = line.decode('utf-8')
                cleaned_line = clean_text(decoded_line)
                encoded_line = cleaned_line.encode("utf-8")
                clean_data.write(encoded_line + b"\n")


def preprocesses_data_analyze_chars():
    """Pre-process the data - step 2 - analyze the characters"""
    counter = Counter()
    LOGGER.info("Reading data:")
    with open(NEWS_FILE_NAME_CLEAN) as clean_data:
        for line in clean_data:
            decoded_line = line.decode('utf-8')
            counter.update(decoded_line)
    LOGGER.info("Done.\nWriting to file:")
    with open(CHAR_FREQUENCY_FILE_NAME, 'wb') as output_file:
        output_file.write(json.dumps(counter))
    most_popular_chars = {key for key, _value in counter.most_common(CONFIG.number_of_chars)}
    LOGGER.info("The top %s chars are:", CONFIG.number_of_chars)
    LOGGER.info("".join(sorted(most_popular_chars)))


def read_top_chars():
    """Read the saved character frequencies and return the most common chars as a set."""
    with open(CHAR_FREQUENCY_FILE_NAME) as frequency_file:
        chars = json.loads(frequency_file.read())
    counter = Counter(chars)
    most_popular_chars = {key for key, _value in counter.most_common(CONFIG.number_of_chars)}
    return most_popular_chars


def preprocesses_data_filter():
    """Pre-process the data - step 3 - filter only sentences with the right chars"""
    most_popular_chars = read_top_chars()
    LOGGER.info("Reading and filtering data:")
    with open(NEWS_FILE_NAME_FILTERED, "wb") as output_file:
        with open(NEWS_FILE_NAME_CLEAN) as clean_data:
            for line in clean_data:
                decoded_line = line.decode('utf-8')
                if decoded_line and not bool(set(decoded_line) - most_popular_chars):
                    output_file.write(line)
    LOGGER.info("Done.")


def read_filtered_data():
    """Read the filtered data corpus"""
    LOGGER.info("Reading filtered data:")
    with open(NEWS_FILE_NAME_FILTERED) as filtered_data:
        lines = filtered_data.read().decode('utf-8').split("\n")
    LOGGER.info("Read filtered data - %s lines", len(lines))
    return lines


def preprocesses_split_lines():
    """Preprocess the text by splitting the lines between min-length and max_length

    I don't like this step:
      I think the start-of-sentence is important.
      I think the end-of-sentence is important.
      Sometimes the stripped down sub-sentence is missing crucial context.
      Important NGRAMs are cut (though given enough data, that might be moot).
    I do this to enable batch-learning by padding to a fixed length.
    """
    LOGGER.info("Reading filtered data:")
    # The set that used to collect every answer was never read; it is gone so
    # the corpus is no longer held in memory for nothing.
    with open(NEWS_FILE_NAME_SPLIT, "wb") as output_file:
        with open(NEWS_FILE_NAME_FILTERED) as filtered_data:
            for _line in filtered_data:
                line = _line.decode('utf-8')
                while len(line) > MIN_INPUT_LEN:
                    if len(line) <= CONFIG.max_input_len:
                        answer = line
                        line = ""
                    else:
                        space_location = line.rfind(" ", MIN_INPUT_LEN, CONFIG.max_input_len - 1)
                        if space_location > -1:
                            answer = line[:space_location]
                            line = line[len(answer) + 1:]
                        else:
                            space_location = line.rfind(" ")  # no limits this time
                            if space_location == -1:
                                break  # we are done with this line
                            line = line[space_location + 1:]
                            continue
                    output_file.write(answer.encode('utf-8') + b"\n")


def preprocesses_split_lines2():
    """Preprocess the text by splitting the lines between min-length and max_length

    Alternative split: keep only whole lines whose length already fits.
    """
    LOGGER.info("Reading filtered data:")
    answers = set()
    with open(NEWS_FILE_NAME_FILTERED) as filtered_data:
        for encoded_line in filtered_data:
            line = encoded_line.decode('utf-8')
            if CONFIG.max_input_len >= len(line) > MIN_INPUT_LEN:
                answers.add(line)
    LOGGER.info("There are %s 'answers' (sub-sentences)", len(answers))
    LOGGER.info("Here are some examples:")
    for answer in itertools.islice(answers, 10):
        LOGGER.info(answer)
    with open(NEWS_FILE_NAME_SPLIT, "wb") as output_file:
        output_file.write("".join(answers).encode('utf-8'))


def preprocesses_split_lines3():
    """Preprocess the text by selecting only max n-grams

    Alternative split: keep only lines with fewer than 5 spaces.
    """
    LOGGER.info("Reading filtered data:")
    answers = set()
    with open(NEWS_FILE_NAME_FILTERED) as filtered_data:
        for encoded_line in filtered_data:
            line = encoded_line.decode('utf-8')
            if line.count(" ") < 5:
                answers.add(line)
    LOGGER.info("There are %s 'answers' (sub-sentences)", len(answers))
    LOGGER.info("Here are some examples:")
    for answer in itertools.islice(answers, 10):
        LOGGER.info(answer)
    with open(NEWS_FILE_NAME_SPLIT, "wb") as output_file:
        output_file.write("".join(answers).encode('utf-8'))


def preprocess_partition_data():
    """Set aside data for validation"""
    with open(NEWS_FILE_NAME_SPLIT) as split_data:
        answers = split_data.read().decode('utf-8').split("\n")
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    # Explicitly set apart 10% for validation data that we never train over
    split_at = len(answers) - len(answers) // 10
    with open(NEWS_FILE_NAME_TRAIN, "wb") as output_file:
        output_file.write("\n".join(answers[:split_at]).encode('utf-8'))
    with open(NEWS_FILE_NAME_VALIDATE, "wb") as output_file:
        output_file.write("\n".join(answers[split_at:]).encode('utf-8'))


def generate_question(answer):
    """Generate a question by adding noise; pad both strings to max_input_len."""
    question = add_noise_to_string(answer, AMOUNT_OF_NOISE)
    # Add padding:
    question += PADDING * (CONFIG.max_input_len - len(question))
    answer += PADDING * (CONFIG.max_input_len - len(answer))
    return question, answer


def generate_news_data():
    """Generate (questions, answers) from the whole split corpus."""
    print("Generating Data")
    with open(NEWS_FILE_NAME_SPLIT) as split_data:
        answers = split_data.read().decode('utf-8').split("\n")
    questions = []
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for answer_index, answer in enumerate(answers):
        question, answer = generate_question(answer)
        answers[answer_index] = answer
        assert len(answer) == CONFIG.max_input_len
        if random_randint(100000) == 8:  # Show some progress
            print(len(answers))
            print("answer:   '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        question = question[::-1] if CONFIG.inverted else question
        questions.append(question)

    return questions, answers


def train_speller_w_all_data():
    """Train the speller if all data fits into RAM"""
    questions, answers = generate_news_data()
    chars_answer = set.union(*(set(answer) for answer in answers))
    chars_question = set.union(*(set(question) for question in questions))
    chars = list(set.union(chars_answer, chars_question))
    X_train, X_val, y_train, y_val, y_maxlen, ctable = vectorize(questions, answers, chars)
    print("y_maxlen, chars", y_maxlen, "".join(chars))
    model = generate_model(y_maxlen, chars)
    iterate_training(model, X_train, y_train, X_val, y_val, ctable)


def train_speller(from_file=None):
    """Train the speller, optionally resuming from a saved model file."""
    if from_file:
        model = load_model(from_file)
    else:
        model = generate_model(CONFIG.max_input_len, chars=read_top_chars())
    itarative_train(model)


if __name__ == '__main__':
    # Pipeline steps, run once each in this order (uncomment as needed):
    # download_the_news_data()
    # uncompress_data()
    # preprocesses_data_clean()
    # preprocesses_data_analyze_chars()
    # preprocesses_data_filter()
    # preprocesses_split_lines() --- Choose this step or:
    # preprocesses_split_lines2()
    # preprocesses_split_lines3()
    # preprocess_partition_data()
    # train_speller(os.path.join(DATA_FILES_FULL_PATH, "keras_spell_e15.h5"))
    train_speller()