Example #1
def _preprocess(self, raw_title, raw_text):
    """
    Tokenize the title and body text and apply basic normalization
    (optional lowercasing and digit replacement).
    """
    if raw_title is None:
        raw_title = ""
    raw_title = raw_title.strip()
    # raw_title += (raw_title[-1] not in (".", "?", "!")) * "."
    if self.config.lower:
        raw_title = raw_title.lower()
        raw_text = raw_text.lower()
    title_tokens = meng17_tokenize(raw_title)
    text_tokens = meng17_tokenize(raw_text)
    # Concatenate title and body, separated by a period token
    tokens = title_tokens + ["."] + text_tokens
    if self.config.replace_digit:
        tokens = replace_numbers_to_DIGIT(tokens, k=2)
    return " ".join(tokens)
Example #2
import re  # `utils` (providing meng17_tokenize) is assumed to be imported elsewhere in this module

def heuristic_filter(src_token, tgts_token, tgts_str, opt):
    '''
    Filter one example: discard it if the source length is outside the limits,
    and drop individual target keyphrases that fail the heuristic checks below.
    :param src_token: tokenized source text
    :param tgts_token: list of tokenized target keyphrases
    :param tgts_str: list of raw target keyphrase strings
    :param opt: options carrying max/min_src_seq_length and max/min_tgt_seq_length
    :return: (is_valid, filtered_tgts_token, filtered_tgts_str)
    '''
    print('*' * 50)
    print('len(src)=%d, len(tgt)=%d' % (len(src_token), len(tgts_token)))
    print('src: %s' % str(src_token))
    print('tgt: %s' % str(tgts_token))
    print('*' * 50)

    # SOURCE FILTER: if length of src is over/under the given length limit, discard
    if opt.max_src_seq_length and len(src_token) > opt.max_src_seq_length:
        print("INVALID: source is too long [len=%d]: \n%s" % (len(src_token), str(src_token)))
        return False, None, None
    if opt.min_src_seq_length and len(src_token) < opt.min_src_seq_length:
        print("INVALID: source is too short [len=%d]: \n%s" % (len(src_token), str(src_token)))
        return False, None, None

    filtered_tgts_str = []
    filtered_tgts_token = []

    # Go over each keyphrase and check its validity
    for tgt_token, tgt_str in zip(tgts_token, tgts_str):
        tgt_token_for_filter = utils.meng17_tokenize(tgt_str)

        # FILTER 1: if length of tgt exceeds limit, discard
        if opt.max_tgt_seq_length and len(tgt_token_for_filter) > opt.max_tgt_seq_length:
            print("\tInvalid Target: target is too long: %s (originally %s)" % (str(tgt_token), tgt_str))
            continue
        if opt.min_tgt_seq_length and len(tgt_token_for_filter) < opt.min_tgt_seq_length:
            print("\tInvalid Target: target is too short: %s (originally %s)" % (str(tgt_token), tgt_str))
            continue

        # FILTER 2: ignore keyphrases that contain unusual punctuation (very dirty data)
        punc_flag = False
        puncts = re.findall(r'[,_\"<>\(\){}\[\]\?~`!@$%\^=]', tgt_str)
        if len(puncts) > 0:
            print('-' * 50)
            print('Find punctuations in keyword: %s' % tgt_str)
            print('- tokens: %s' % str(tgt_token))
            punc_flag = True

        # FILTER 3: check the quality of long keyphrases (>5 words) with a heuristic rule: discard ones that mostly repeat the same words
        heuristic_rule_flag = False
        if len(tgt_token_for_filter) > 5:
            tgt_set = set(tgt_token_for_filter)
            if len(tgt_set) * 2 < len(tgt_token_for_filter):
                print('\t Invalid Target: heuristic_rule on long keyphrases (>5 words)')
                heuristic_rule_flag = True

        # FILTER 4: filter keywords like primary 75v05;secondary 76m10;65n30
        if (len(tgt_token_for_filter) > 0 and re.match(r'\d\d[a-zA-Z\-]\d\d', tgt_token_for_filter[0].strip())) \
                or (len(tgt_token_for_filter) > 1 and re.match(r'\d\d\w\d\d', tgt_token_for_filter[1].strip())):
            print('\tInvalid Target: matching template \\d\\d[a-z]\\d\\d: %s' % tgt_str)
            continue

        if (punc_flag or heuristic_rule_flag):
            if heuristic_rule_flag:
                print('\t Invalid Target: heuristic_rule on long keyphrases (>5 words)')
            if punc_flag:
                print('\t Invalid Target: found punctuation in keyphrases')
            continue

        filtered_tgts_str.append(tgt_str)
        filtered_tgts_token.append(tgt_token)

    # ignore examples that have zero valid targets; they are not helpful for training
    if len(filtered_tgts_str) == 0:
        print('INVALID: found no valid targets')
        return False, None, None

    return True, filtered_tgts_token, filtered_tgts_str
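A minimal usage sketch for the filter above, assuming the surrounding module already imports re and the project's utils, and using illustrative length limits in place of the real command-line options:

from argparse import Namespace

# Illustrative limits only; the real `opt` comes from the project's argument parser.
opt = Namespace(max_src_seq_length=300, min_src_seq_length=20,
                max_tgt_seq_length=6, min_tgt_seq_length=1)

src_token = "we study sequence to sequence models for keyphrase generation".split()
tgts_str = ["keyphrase generation", "sequence to sequence"]
tgts_token = [t.split() for t in tgts_str]

valid, kept_token, kept_str = heuristic_filter(src_token, tgts_token, tgts_str, opt)
# Here valid is False: the toy source has only 9 tokens, below min_src_seq_length.
# With an abstract-length source both keyphrases would survive the filters.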
Example #3
        # Normalize casing, tokenize each field with the selected tokenizer,
        # and optionally replace long digit sequences
        if opt.lower:
            title = title.lower()
            abstract = abstract.lower()
            keywords = [k.lower() for k in keywords]

        if opt.tokenizer == "str":
            title_token = [title]
            abstract_token = [abstract]
            keywords_token = keywords
        elif opt.tokenizer == "en_word":
            title_token = title.split(' ')
            abstract_token = abstract.split(' ')
            keywords_token = [kw.split(' ') for kw in keywords]
        elif opt.tokenizer == "meng17":
            title_token = utils.meng17_tokenize(title)
            abstract_token = utils.meng17_tokenize(abstract)
            keywords_token = [utils.meng17_tokenize(kw) for kw in keywords]
        elif opt.tokenizer == "en_retain_punc":
            title_token = utils.retain_punc_tokenize(title)
            abstract_token = utils.retain_punc_tokenize(abstract)
            keywords_token = [utils.retain_punc_tokenize(kw) for kw in keywords]
        elif opt.tokenizer == "en_subword":
            raise NotImplementedError
        else:
            raise NotImplementedError

        if opt.replace_digit:
            title_token = utils.replace_numbers_to_DIGIT(title_token, k=2)
            abstract_token = utils.replace_numbers_to_DIGIT(abstract_token, k=2)
            keywords_token = [utils.replace_numbers_to_DIGIT(kw, k=2) for kw in keywords_token]
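The option flags consumed in this branch (lower, tokenizer, replace_digit) could be declared roughly as below. This is a sketch assuming an argparse-based setup; the attribute names match the ones used above, but the flag spellings, defaults, and help strings are illustrative only.

import argparse

parser = argparse.ArgumentParser()
# Options read by the snippet above; defaults here are illustrative only.
parser.add_argument('--lower', action='store_true',
                    help='lowercase title, abstract, and keywords')
parser.add_argument('--tokenizer', default='meng17',
                    choices=['str', 'en_word', 'meng17', 'en_retain_punc', 'en_subword'],
                    help='tokenization scheme applied to all fields')
parser.add_argument('--replace_digit', action='store_true',
                    help='replace long digit sequences with a placeholder token')
opt = parser.parse_args(['--lower', '--replace_digit'])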