Example #1
def extract_ngram_features_and_tags(sentence,
                                    bigram_only=False,
                                    window_size=4,
                                    segmented=True):
    """
    Feature extraction for windowed approaches
    See Also https://github.com/chqiwang/convseg/
    Parameters
    ----------
    sentence
    bigram_only
    window_size
    segmented

    Returns
    -------

    """
    chars, tags = bmes_of(sentence, segmented)
    chars = CharTable.normalize_chars(chars)
    ret = []
    ret.append(chars)
    # TODO: optimize ngram generation using https://www.tensorflow.org/api_docs/python/tf/strings/ngrams
    ret.extend(extract_ngram_features(chars, bigram_only, window_size))
    ret.append(tags)
    return tuple(ret[:-1]), ret[-1]  # x, y
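bmes_of and extract_ngram_features are defined elsewhere in the project. As a rough, self-contained sketch of the technique (the _sketch function names, the <pad> token and the exact feature layout are assumptions, not the project's implementation), windowed n-gram feature extraction can look like this:

def bmes_of_sketch(words):
    # Tag each character of each word with B/M/E/S
    # (Begin, Middle, End, Single: the standard word-segmentation scheme).
    chars, tags = [], []
    for word in words:
        chars.extend(word)
        if len(word) == 1:
            tags.append('S')
        else:
            tags.extend(['B'] + ['M'] * (len(word) - 2) + ['E'])
    return chars, tags


def extract_ngram_features_sketch(chars, bigram_only=False, window_size=4):
    # For each offset within the window, emit the character sequence shifted
    # by that offset; bigram features concatenate two adjacent shifted
    # characters. Positions outside the sentence are padded with a sentinel.
    pad = '<pad>'
    half = window_size // 2

    def at(i):
        return chars[i] if 0 <= i < len(chars) else pad

    features = []
    if not bigram_only:
        for offset in range(-half, half + 1):  # unigram features
            features.append([at(i + offset) for i in range(len(chars))])
    for offset in range(-half, half):  # bigram features
        features.append([at(i + offset) + at(i + offset + 1)
                         for i in range(len(chars))])
    return features


chars, tags = bmes_of_sketch(['商品', '和', '服务'])
print(chars)  # ['商', '品', '和', '服', '务']
print(tags)   # ['B', 'E', 'S', 'B', 'E']
print(len(extract_ngram_features_sketch(chars)))  # 5 unigram + 4 bigram rows = 9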
Example #2
def inputs_to_samples(self, inputs, gold=False):
    # With gold tags, inputs yields (chars, tags) pairs; otherwise pair each
    # input with a placeholder so a single loop handles both cases.
    for chars, tags in (inputs if gold else zip(inputs, [None] * len(inputs))):
        if not gold:
            # Pad with dummy tags so samples have a uniform (chars, tags) shape
            tags = [self.tag_vocab.safe_pad_token] * len(chars)
        chars = CharTable.normalize_chars(chars)
        yield chars, tags
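The method accepts two input shapes, shown below in a hypothetical usage sketch; the stub classes and the <pad> token are stand-ins for the project's real vocabulary and CharTable, and normalization is omitted:

class StubVocab:
    safe_pad_token = '<pad>'


class StubTransform:
    tag_vocab = StubVocab()

    def inputs_to_samples(self, inputs, gold=False):
        for chars, tags in (inputs if gold else zip(inputs, [None] * len(inputs))):
            if not gold:
                tags = [self.tag_vocab.safe_pad_token] * len(chars)
            yield chars, tags  # CharTable normalization omitted in this stub


t = StubTransform()
# gold=True: inputs are (chars, tags) pairs
print(list(t.inputs_to_samples([(['商', '品'], ['B', 'E'])], gold=True)))
# gold=False: inputs are bare character sequences; tags become pad placeholders
print(list(t.inputs_to_samples([['商', '品']], gold=False)))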
Example #3
import re


def clean_text(s):
    """
    1. normalize characters
    2. 按照规则去除特殊表达:
        1. html tags
        2. 章节名
        3. 括号 () 中所有内容
        4. 去除 url 链接
        5. 合并连续重复标点、字
    :param s: original text
    :return: cleaned text

    :reference:
        1. https://www.pythonf.cn/read/52608
        2. https://gist.github.com/gruber/8891611
    """
    new_text = CharTable.normalize_text(s)
    new_text = re.sub(r"<.*>|\(.*\)|第.*章.*| +", '',
                      new_text)  # remove html  & 括号 & 章节名 & 空格
    url_regex = re.compile(
        r"""(?i)\b((?:https?://|www\d{0,3}[。,,.]|[a-z0-9.\-]+[.。,,][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"""
    )
    new_text = re.sub(url_regex, "", new_text)
    new_text = re.sub("shuyaya.com|wap", "", new_text)  # adhoc data cleaning
    new_text = re.sub(r"[ #$%&`\'/@★\[\\\]^_{|}~]+", '',
                      new_text)  # remove special characters
    new_text = re.sub(",+", ",", new_text)
    new_text = re.sub("。。。+|\.\.\.+|…+", "…", new_text)  # 省略号 in HanLP
    new_text = re.sub("-+", "-", new_text)  # 合并-
    new_text = re.sub("—+", "-", new_text)  # 合并———
    new_text = re.sub("\++", "+", new_text)
    new_text = re.sub("\?+", "?", new_text)
    new_text = re.sub("!+", "!", new_text)
    # new_text = re.sub(r"([^\.])(\.)([^\.])", r"\1\3", new_text)  # remove single dots

    # chinese_regex = re.compile(u'[⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]', re.UNICODE)
    # for chinese_character in re.findall(chinese_regex, new_text):
    #     new_text = re.sub("[{" + chinese_character + "}]{3,}", chinese_character * 3, new_text)  # collapse 3+ repeats of a Chinese character to 3
    return new_text
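The url_regex above is John Gruber's liberal URL pattern (reference 2 in the docstring). A self-contained check of how it strips links; the sample string is made up:

import re

url_regex = re.compile(
    r"""(?i)\b((?:https?://|www\d{0,3}[。,,.]|[a-z0-9.\-]+[.。,,][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"""
)

sample = '详见 https://example.com/page?id=1 和 www.example.org/ 两处链接'
print(re.sub(url_regex, '', sample))  # both links removed, surrounding text kept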
Example #4
def inputs_to_samples(self, inputs, gold=False):
    # Gold-only variant: inputs already yield (chars, tags) pairs,
    # so only character normalization is applied.
    for chars, tags in inputs:
        chars = CharTable.normalize_chars(chars)
        yield chars, tags