def tokenize(self, text):
    if self.external:
        # Pipe the sentence through the external tokenizer process and
        # undo Moses-style XML escaping on the tokens it returns.
        self.tokenizer.writeline(text.rstrip('\n'))
        return [no_escaping(t) for t in self.tokenizer.readline().rstrip('\n').split()]
    else:
        return self.tokenizer.tokenize(text, escape=False)
# Variant of tokenize() that also accepts a list of sentences (batch mode).
def tokenize(self, text):
    if self.external:
        if isinstance(text, list):
            # Batch mode: join the sentences, tokenize them as one block
            # (tokenize_block is assumed to be defined on the same class),
            # and split the result back into one tokenized line per input.
            return self.tokenize_block('\n'.join(text) + '\n').split('\n')
        else:
            self.tokenizer.writeline(text.rstrip('\n'))
            return [no_escaping(t) for t in self.tokenizer.readline().rstrip('\n').split()]
    else:
        if isinstance(text, list):
            return [self.tokenizer.tokenize(line, escape=False) for line in text]
        else:
            return self.tokenizer.tokenize(text, escape=False)
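# Usage sketch (hedged: the class owning tokenize() is not shown here, so the
# constructor name below is hypothetical). A single string yields a list of
# unescaped tokens; a list of strings yields one tokenized result per line:
#
#   tok = SentenceTokenizer(lang='en', external=False)   # hypothetical ctor
#   tok.tokenize("Hello, world!")                        # ['Hello', ',', 'world', '!']
#   tok.tokenize(["First line.", "Second line."])        # one result per input line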
def feature_extract(srcsen, trgsen, tokenize_l, tokenize_r, args):
    length_ratio = args.length_ratio
    dict12 = args.dict_sl_tl
    dict21 = args.dict_tl_sl
    normalize_by_length = args.normalize_by_length
    qmax_limit = args.qmax_limit
    treat_oovs = args.treat_oovs
    disable_features_quest = args.disable_features_quest
    lang1 = args.source_lang
    lang2 = args.target_lang
    fv = args.features_version

    # Sentence tokenization, with and without capital letters.
    # Tokenized sentences are truncated to the first 250 tokens.
    tokenize_l.writeline(srcsen.rstrip('\n'))
    tokenize_r.writeline(trgsen.rstrip('\n'))
    left_sentence_orig_tok = [no_escaping(t) for t in tokenize_l.readline().rstrip('\n').split()][0:250]
    right_sentence_orig_tok = [no_escaping(t) for t in tokenize_r.readline().rstrip('\n').split()][0:250]
    left_sentence_tok = [i.lower() for i in left_sentence_orig_tok]
    right_sentence_tok = [i.lower() for i in right_sentence_orig_tok]

    features = []
    features.append(feature_language(srcsen, lang1))
    features.append(feature_language(trgsen, lang2))
    features.append(feature_sentence_length(srcsen))
    features.append(feature_sentence_length(trgsen))
    features.extend(feature_character_class_dist(srcsen))
    features.extend(feature_character_class_dist(trgsen))
    features.extend(feature_character_measurements(srcsen))
    features.extend(feature_character_measurements(trgsen))
    features.append(feature_sentence_length(left_sentence_tok))
    features.append(feature_sentence_length(right_sentence_tok))
    features.append(feature_length_poisson(left_sentence_tok, right_sentence_tok, length_ratio))
    features.append(feature_length_poisson(right_sentence_tok, left_sentence_tok, 1.0 / length_ratio))
    features.append(feature_dict_qmax(left_sentence_tok, right_sentence_tok, dict12,
                                      normalize_by_length, treat_oovs, dict21, fv, qmax_limit))
    features.extend(feature_dict_coverage(left_sentence_tok, right_sentence_tok, dict12))
    features.append(feature_dict_qmax(right_sentence_tok, left_sentence_tok, dict21,
                                      normalize_by_length, treat_oovs, dict12, fv, qmax_limit))
    features.extend(feature_dict_coverage(right_sentence_tok, left_sentence_tok, dict21))

    # Note: despite its name, this flag *adds* the extra features when True.
    if disable_features_quest:
        # Average token length
        features.append(feature_avg_token_len(left_sentence_tok))
        features.append(feature_avg_token_len(right_sentence_tok))

        # Number of punctuation marks
        features.append(feature_num_punct_marks(left_sentence_tok))
        features.append(feature_num_punct_marks(right_sentence_tok))

        # Number of punctuation marks of each type: dot, comma, colon,
        # semicolon, double quotes, single quotes
        features.extend(feature_num_punct_marks_type(left_sentence_tok))
        features.extend(feature_num_punct_marks_type(right_sentence_tok))

        # Numeric expression preservation
        features.extend(feature_number_preservation(left_sentence_tok, right_sentence_tok))
        features.extend(feature_number_preservation(right_sentence_tok, left_sentence_tok))

        # Capitalized letter preservation, in both directions (the original
        # computed the left-to-right direction twice)
        features.extend(feature_capitalized_preservation(left_sentence_orig_tok, right_sentence_orig_tok))
        features.extend(feature_capitalized_preservation(right_sentence_orig_tok, left_sentence_orig_tok))

    return features
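# Usage sketch (hedged: the feature_* helpers and the probabilistic
# dictionaries come from the surrounding module; the Namespace below only
# illustrates the fields that feature_extract() reads from args):
#
#   import argparse
#   args = argparse.Namespace(
#       length_ratio=1.0,             # expected target/source token-length ratio
#       dict_sl_tl=dict_sl_tl,        # source->target probabilistic dictionary
#       dict_tl_sl=dict_tl_sl,        # target->source probabilistic dictionary
#       normalize_by_length=True,
#       qmax_limit=20,
#       treat_oovs=False,
#       disable_features_quest=True,  # True adds the extra features above
#       source_lang='en',
#       target_lang='fr',
#       features_version=2)
#   features = feature_extract(src_sentence, trg_sentence,
#                              tokenize_l, tokenize_r, args)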
def write_sentences(args, source_sentence, target_sentence, output):
    # Undo XML escaping of characters done by Moses before writing sentences
    output.write(no_escaping(source_sentence))
    output.write("\t")
    output.write(no_escaping(target_sentence))
    output.write("\n")
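# Usage sketch (hedged: output can be any writable text stream; args is kept
# only for interface compatibility and is not used by the function):
#
#   with open('corpus.en-fr.tsv', 'w', encoding='utf-8') as out:
#       write_sentences(args, 'He said &quot;hi&quot;.', 'Il a dit salut.', out)
#   # The file now holds one tab-separated pair per call, with the
#   # Moses XML escaping (&quot;, &amp;, ...) undone.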