def extract_ngram_features_and_tags(sentence, bigram_only=False, window_size=4, segmented=True):
    """Extract windowed n-gram features and BMES tags for one sentence.

    See Also: https://github.com/chqiwang/convseg/

    Parameters
    ----------
    sentence
        Input sentence; passed straight to ``bmes_of``.
    bigram_only
        If True, restrict the n-gram features to bigrams.
    window_size
        Context window size for n-gram extraction.
    segmented
        Whether ``sentence`` is already word-segmented.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is a tuple of feature sequences (normalized
        chars first, then the n-gram feature sequences) and ``y`` is the
        BMES tag sequence.
    """
    chars, tags = bmes_of(sentence, segmented)
    chars = CharTable.normalize_chars(chars)
    # TODO: optimize ngram generation using https://www.tensorflow.org/api_docs/python/tf/strings/ngrams
    features = (chars,) + tuple(extract_ngram_features(chars, bigram_only, window_size))
    return features, tags  # x, y
def inputs_to_samples(self, inputs, gold=False):
    """Convert raw inputs into ``(chars, tags)`` samples.

    Parameters
    ----------
    inputs
        When ``gold`` is True, an iterable of ``(chars, tags)`` pairs.
        Otherwise, an iterable of character sequences (tags unknown).
    gold
        Whether ``inputs`` carries gold-standard tags.

    Yields
    ------
    tuple
        ``(chars, tags)`` with ``chars`` normalized via ``CharTable``.
        For non-gold input, ``tags`` is a pad-token placeholder of the
        same length as ``chars`` so downstream code sees aligned pairs.
    """
    # Iterate lazily instead of `zip(inputs, [None] * len(inputs))`:
    # the original required len(inputs), which fails on generators and
    # allocated a throwaway list whose values were immediately discarded.
    for item in inputs:
        if gold:
            chars, tags = item
        else:
            chars = item
            tags = [self.tag_vocab.safe_pad_token] * len(chars)
        chars = CharTable.normalize_chars(chars)
        yield chars, tags
def clean_text(s):
    """Normalize and clean raw Chinese text.

    Cleaning steps:
      1. normalize characters (``CharTable.normalize_text``)
      2. rule-based removal of special spans:
         1. HTML tags
         2. chapter headings (第...章)
         3. everything inside parentheses ``()``
         4. URLs
         5. collapse runs of repeated punctuation

    :param s: original text
    :return: cleaned text
    :reference:
        1. https://www.pythonf.cn/read/52608
        2. https://gist.github.com/gruber/8891611
    """
    new_text = CharTable.normalize_text(s)
    # Remove HTML tags, parenthesized spans, chapter headings, and spaces.
    new_text = re.sub(r"<.*>|\(.*\)|第.*章.*| +", '', new_text)
    # Liberal URL matcher (adapted from Gruber's pattern); compiled per call —
    # the `re` cache makes repeat compilation cheap, but hoisting to module
    # level would be a further micro-optimization.
    url_regex = re.compile(
        r"""(?i)\b((?:https?://|www\d{0,3}[。,,.]|[a-z0-9.\-]+[.。,,][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"""
    )
    new_text = re.sub(url_regex, "", new_text)
    # Ad-hoc data cleaning; the literal dot is escaped so "shuyaya.com"
    # does not accidentally match e.g. "shuyayaXcom".
    new_text = re.sub(r"shuyaya\.com|wap", "", new_text)
    new_text = re.sub(r"[ #$%&`\'/@★\[\\\]^_{|}~]+", '', new_text)  # remove special characters
    # Raw strings below fix invalid escape sequences ("\.", "\+", "\?")
    # that warn on modern Python.
    new_text = re.sub(",+", ",", new_text)  # collapse commas
    new_text = re.sub(r"。。。+|\.\.\.+|…+", "…", new_text)  # ellipsis, HanLP convention
    new_text = re.sub("-+", "-", new_text)  # collapse -
    new_text = re.sub("—+", "-", new_text)  # collapse ——— into -
    new_text = re.sub(r"\++", "+", new_text)
    new_text = re.sub(r"\?+", "?", new_text)
    new_text = re.sub("!+", "!", new_text)
    # new_text = re.sub(r"([^\.])(\.)([^\.])", r"\1\3", new_text)  # remove single dots
    # chinese_regex = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
    # for chinese_character in re.findall(chinese_regex, new_text):
    #     new_text = re.sub("[{" + chinese_character + "}]{3,}", chinese_character * 3, new_text)  # collapse 3+ repeats of a CJK char to 3
    return new_text
def inputs_to_samples(self, inputs, gold=False):
    """Yield ``(chars, tags)`` pairs with the characters normalized.

    ``gold`` is accepted for interface compatibility with sibling
    transforms but is not consulted here: every input pair is assumed
    to already carry its tags.
    """
    for chars, tags in inputs:
        yield CharTable.normalize_chars(chars), tags