Example #1
def tokenize(self, text):
    if self.external:
        # External mode: the tokenizer is a line-based subprocess; send one
        # raw line, read the tokenized line back, and undo Moses XML escaping.
        self.tokenizer.writeline(text.rstrip('\n'))
        return [
            no_escaping(t)
            for t in self.tokenizer.readline().rstrip('\n').split()
        ]
    else:
        # Internal mode: call the in-process tokenizer directly.
        return self.tokenizer.tokenize(text, escape=False)
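For context on the external branch: it assumes a line-oriented subprocess protocol, one raw line written in, one tokenized line read back. Below is a minimal sketch of such a wrapper; the LineProcess name and the writeline/readline pair are assumptions for illustration, with cat standing in for a real tokenizer binary.

import subprocess

class LineProcess:
    # Sketch of the assumed writeline()/readline() protocol, not the
    # project's actual tokenizer wrapper.
    def __init__(self, cmd):
        self.proc = subprocess.Popen(
            cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            universal_newlines=True, bufsize=1)  # line-buffered text mode

    def writeline(self, line):
        self.proc.stdin.write(line + '\n')
        self.proc.stdin.flush()

    def readline(self):
        return self.proc.stdout.readline()

tok = LineProcess(['cat'])  # 'cat' echoes input, standing in for a tokenizer
tok.writeline('Hello, world!')
print(tok.readline().rstrip('\n').split())  # ['Hello,', 'world!']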
Example #2
def tokenize(self, text):
    if self.external:
        # Batch a list of lines through the external process in one call;
        # a single string goes through the line-based protocol.
        if isinstance(text, list):
            return self.tokenize_block('\n'.join(text) + '\n').split('\n')
        else:
            self.tokenizer.writeline(text.rstrip('\n'))
            return [
                no_escaping(t)
                for t in self.tokenizer.readline().rstrip('\n').split()
            ]
    else:
        # In-process tokenizer: tokenize each line (or the single string)
        # directly, without Moses XML escaping.
        if isinstance(text, list):
            return [self.tokenizer.tokenize(line, escape=False)
                    for line in text]
        else:
            return self.tokenizer.tokenize(text, escape=False)
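The internal branch matches the sacremoses MosesTokenizer API, whose tokenize() method accepts an escape flag. Assuming that is the object behind self.tokenizer (an assumption; the class is not shown here), the two call shapes behave like this:

from sacremoses import MosesTokenizer  # assumed backend; pip install sacremoses

mt = MosesTokenizer(lang='en')
# A single string yields one token list; special characters stay unescaped.
print(mt.tokenize('A <b> tag & more', escape=False))
# A list of lines yields one token list per line, as in the else branch.
print([mt.tokenize(line, escape=False) for line in ['first line', 'second one']])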
Example #3
def feature_extract(srcsen, trgsen, tokenize_l, tokenize_r, args):
    length_ratio = args.length_ratio
    dict12 = args.dict_sl_tl
    dict21 = args.dict_tl_sl
    normalize_by_length = args.normalize_by_length
    qmax_limit = args.qmax_limit
    treat_oovs = args.treat_oovs
    disable_features_quest = args.disable_features_quest
    lang1 = args.source_lang
    lang2 = args.target_lang
    fv = args.features_version

    # Tokenize both sentences through the external tokenizers, undoing the
    # Moses XML escaping and capping each side at 250 tokens; keep both the
    # original-case and lowercased token lists.
    tokenize_l.writeline(srcsen.rstrip('\n'))
    tokenize_r.writeline(trgsen.rstrip('\n'))
    left_sentence_orig_tok = [
        no_escaping(t) for t in tokenize_l.readline().rstrip('\n').split()
    ][0:250]
    right_sentence_orig_tok = [
        no_escaping(t) for t in tokenize_r.readline().rstrip('\n').split()
    ][0:250]
    left_sentence_tok = [i.lower() for i in left_sentence_orig_tok]
    right_sentence_tok = [i.lower() for i in right_sentence_orig_tok]

    features = []

    # Language identification and surface features on the raw sentences.
    features.append(feature_language(srcsen, lang1))
    features.append(feature_language(trgsen, lang2))
    features.append(feature_sentence_length(srcsen))
    features.append(feature_sentence_length(trgsen))
    features.extend(feature_character_class_dist(srcsen))
    features.extend(feature_character_class_dist(trgsen))
    features.extend(feature_character_measurements(srcsen))
    features.extend(feature_character_measurements(trgsen))

    # Token-level sentence lengths.
    features.append(feature_sentence_length(left_sentence_tok))
    features.append(feature_sentence_length(right_sentence_tok))
    # Poisson length-ratio features, in both translation directions.
    features.append(
        feature_length_poisson(left_sentence_tok, right_sentence_tok,
                               length_ratio))
    features.append(
        feature_length_poisson(right_sentence_tok, left_sentence_tok,
                               1.0 / length_ratio))

    # Probabilistic-dictionary features (qmax and coverage), both directions.
    features.append(
        feature_dict_qmax(left_sentence_tok, right_sentence_tok, dict12,
                          normalize_by_length, treat_oovs, dict21, fv,
                          qmax_limit))
    features.extend(
        feature_dict_coverage(left_sentence_tok, right_sentence_tok, dict12))
    features.append(
        feature_dict_qmax(right_sentence_tok, left_sentence_tok, dict21,
                          normalize_by_length, treat_oovs, dict12, fv,
                          qmax_limit))
    features.extend(
        feature_dict_coverage(right_sentence_tok, left_sentence_tok, dict21))
    # QuEst-style features. Note: despite its name, disable_features_quest
    # is truthy when these features are enabled (apparently set via a
    # store_false command-line flag).
    if disable_features_quest:
        # Average token length
        features.append(feature_avg_token_len(left_sentence_tok))
        features.append(feature_avg_token_len(right_sentence_tok))

        # Number of punctuation marks
        features.append(feature_num_punct_marks(left_sentence_tok))
        features.append(feature_num_punct_marks(right_sentence_tok))

        # Number of punctuation marks of each type: dot, comma, colon,
        # semicolon, double quotes, single quotes
        features.extend(feature_num_punct_marks_type(left_sentence_tok))
        features.extend(feature_num_punct_marks_type(right_sentence_tok))

        # Numeric expression preservation
        features.extend(
            feature_number_preservation(left_sentence_tok, right_sentence_tok))
        features.extend(
            feature_number_preservation(right_sentence_tok, left_sentence_tok))

        # Capitalized letter preservation
        features.extend(
            feature_capitalized_preservation(left_sentence_orig_tok,
                                             right_sentence_orig_tok))
        features.extend(
            feature_capitalized_preservation(right_sentence_orig_tok,
                                             left_sentence_orig_tok))

    return features
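Since feature_extract only reads attributes off args, it can be exercised with a plain namespace. The sketch below lists the fields the function expects; every value is an illustrative placeholder (the two dictionaries would be probabilistic lexicons loaded elsewhere in the pipeline).

from argparse import Namespace

args = Namespace(
    length_ratio=1.0,              # expected source/target token-length ratio
    dict_sl_tl={}, dict_tl_sl={},  # placeholder probabilistic dictionaries
    normalize_by_length=True,
    qmax_limit=20,
    treat_oovs=True,
    disable_features_quest=True,   # truthy = QuEst-style features enabled
    source_lang='en', target_lang='fr',
    features_version=2,            # illustrative value
)
# features = feature_extract(srcsen, trgsen, tokenize_l, tokenize_r, args)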

def write_sentences(args, source_sentence, target_sentence, output):
    # Undo the XML escaping applied by Moses, then write the sentence pair
    # as one tab-separated line.
    output.write(no_escaping(source_sentence))
    output.write("\t")
    output.write(no_escaping(target_sentence))
    output.write("\n")