"they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"] stopwords = set(stopwords) OLD_ENGLISH = {"thy": "your", "thou": "you", "Thy": "Your", "Thou": "You"} # moses tokenizer from sacremoses import MosesTruecaser, MosesTokenizer, MosesDetokenizer, MosesDetruecaser mtok = MosesTokenizer(lang='en') mtr = MosesTruecaser("vocab/truecase-model.en") md = MosesDetokenizer(lang="en") mdtr = MosesDetruecaser() # bpe tokenizer from subword_nmt.apply_bpe import BPE, read_vocabulary vocabulary = read_vocabulary(codecs.open("vocab/vocab.bpe35000.chr", encoding='utf-8'), 10) bpe = BPE(codes=codecs.open("vocab/codes_file_chr_35000", encoding='utf-8'), merges=35000, vocab=vocabulary) # load nmt models import onmt.opts from translator_for_demo import build_translator from onmt.utils.parse import ArgumentParser def _parse_opt(opt): prec_argv = sys.argv
    with open(tgt_file, 'r') as f:
        targets = f.readlines()
    assert len(sources) == len(targets)
    print('Loaded', len(sources), 'sentences')
    return sources, targets


print("Loading vocab...")
src_vocab, tgt_vocab = d.load_vocab(src_lang, tgt_lang)
d.SRC.vocab = src_vocab
d.TGT.vocab = tgt_vocab
src_pad_key = d.SRC.vocab.stoi[d.BLANK_WORD]
tgt_pad_key = d.TGT.vocab.stoi[d.BLANK_WORD]
mtok = MosesDetokenizer(lang=tgt_lang)

print("Loading data...")
sources, targets = load_data(test_year=TEST_YEAR)

print('Loading model ...')
model = Seq2SeqModel.load(src_lang=src_lang, tgt_lang=tgt_lang, epoch=20)

print('Starting test...')
i = 0
translations = []
references = []
with torch.no_grad():
    for src_text, tgt_text in zip(sources, targets):
def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
    self.task = task
    self.models = models
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary
    self.src_bpe = src_bpe
    self.use_cuda = torch.cuda.is_available() and not args.cpu
    self.args = args

    # optimize model for generation
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if self.use_cuda:
            model.cuda()

    self.generator = self.task.build_generator(args)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    self.align_dict = utils.load_align_dict(args.replace_unk)

    self.max_positions = utils.resolve_max_positions(
        self.task.max_positions(),
        *[model.max_positions() for model in models]
    )

    self.in_transforms = []
    self.out_transforms = []

    if getattr(args, 'moses', False):
        tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
        detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
        self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
        self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
    elif getattr(args, 'nltk', False):
        from nltk.tokenize import word_tokenize
        self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))

    if getattr(args, 'gpt2_bpe', False):
        from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
        encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
        vocab_bpe = src_bpe
        encoder = get_encoder(encoder_json, vocab_bpe)
        self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
        self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
        self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
    elif getattr(args, 'sentencepiece', False):
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(src_bpe)
        self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
    elif src_bpe is not None:
        bpe_parser = apply_bpe.create_parser()
        bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
        bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator,
                            None, bpe_args.glossaries)
        self.in_transforms.append(lambda s: bpe.process_line(s))
        self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
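# Hypothetical use of the transform chains built above: `gen` is an instance
# of this class; in_transforms preprocess raw text, out_transforms postprocess
# the decoded string. A sketch, not fairseq's actual interactive entry point.
line = "Hello world."
for fn in gen.in_transforms:
    line = fn(line)
# ... binarize `line`, run gen.generator, convert the output ids to a string ...
hyp = "..."  # placeholder for the decoded string
for fn in gen.out_transforms:
    hyp = fn(hyp)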
class BITETokenizer(object):
    inflection_tokens = [
        "[JJR]", "[JJS]", "[NNS]", "[NNPS]", "[RBR]", "[RBS]", "[VBD]",
        "[VBG]", "[VBN]", "[VBP]", "[VBZ]"
    ]
    single_char_map = {
        "[JJR]": chr(9774), "[JJS]": chr(9775), "[NNS]": chr(9776),
        "[NNPS]": chr(9777), "[RBR]": chr(9778), "[RBS]": chr(9779),
        "[VBD]": chr(9780), "[VBG]": chr(9781), "[VBN]": chr(9782),
        "[VBP]": chr(9783), "[VBZ]": chr(9784)
    }
    reverse_single_char_map = {v: k for k, v in single_char_map.items()}
    lemma_tags = {'NN', 'VB', 'JJ', 'RB', 'MD', "NNP"}
    have_inflections = {'NOUN', 'ADJ', 'VERB'}

    def __init__(self, pretokenizer='moses'):
        self.tagger = PerceptronTagger()
        self.pretok_type = pretokenizer
        if pretokenizer == 'bertpretokenizer':
            self.pretokenizer = BertPreTokenizer()
        elif pretokenizer == 'moses':
            self.pretokenizer = MosesTokenizer()
            self.detokenizer = MosesDetokenizer()
        elif pretokenizer == 'whitespace':
            pass
        else:
            raise ValueError(
                "pretokenizer must be 'bertpretokenizer', 'moses', or 'whitespace'."
            )

    def _pretokenize(self, sentence: str) -> List[str]:
        if self.pretok_type == 'bertpretokenizer':
            return [tup[0] for tup in self.pretokenizer.pre_tokenize(sentence)]
        elif self.pretok_type == 'whitespace':
            return sentence.split()
        else:
            return self.pretokenizer.tokenize(sentence)

    def tokenize(self,
                 sentence: Union[str, List[str]],
                 pretokenize: bool = True,
                 map_to_single_char: bool = False) -> List[str]:
        if pretokenize:
            pretokenized = self._pretokenize(sentence)
        else:
            # Allow users to pass in a list of tokens if using custom pretokenizers
            pretokenized = sentence
        ptb_pos_tagged = self.tagger.tag(pretokenized)
        universal_pos_tagged = [(token, map_tag("en-ptb", 'universal', tag))
                                for (token, tag) in ptb_pos_tagged]
        tokenized = []
        for i, (word, pos) in enumerate(ptb_pos_tagged):
            if universal_pos_tagged[i][1] in self.have_inflections and \
                    word not in (string.punctuation + '—') and \
                    pos not in self.lemma_tags:
                lemma = getLemma(word, upos=universal_pos_tagged[i][1])[0]
                if not lemma:
                    lemma = word
                tokenized.append(lemma)
                tokenized.append('[' + pos + ']')
            else:
                tokenized.append(word)
        if map_to_single_char:
            tokenized = [
                self.single_char_map[token]
                if token in self.inflection_tokens else token
                for token in tokenized
            ]
        return tokenized

    def detokenize(self,
                   tokens: List[str],
                   as_list: bool = False) -> Union[str, List[str]]:
        result = []
        for i, token in enumerate(tokens):
            # map single-character placeholders back to inflection tokens
            if token in self.reverse_single_char_map:
                token = self.reverse_single_char_map[token]
            if token in self.inflection_tokens:
                if i != 0:
                    inflected = getInflection(result[-1], tag=token[1:-1])
                    if inflected:
                        result[-1] = inflected[0]
            else:
                result.append(token)
        if as_list:
            # Allow users to detokenize using their own detokenizers
            return result
        if self.pretok_type == 'moses':
            return self.detokenizer.detokenize(result)
        return ' '.join(result)
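# Usage sketch for BITETokenizer (outputs are what lemminflect's
# getLemma/getInflection typically produce; treat them as illustrative).
bite = BITETokenizer(pretokenizer='whitespace')
toks = bite.tokenize("She walked home")
# e.g. ['She', 'walk', '[VBD]', 'home'] -- inflected forms become lemma + marker
print(bite.detokenize(toks))  # re-inflects 'walk' via [VBD] -> "She walked home"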
import torch
import jsondiff
from allennlp.common.checks import ConfigurationError
from allennlp.common.params import Params
from sacremoses import MosesDetokenizer

from .config import Params  # noqa: F811 -- intentionally shadows the allennlp Params

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

SOS_TOK, EOS_TOK = "<SOS>", "<EOS>"

# Note: using the full 'detokenize()' method is not recommended, since it does
# a poor job of adding correct whitespace. Use unescape_xml() only.
_MOSES_DETOKENIZER = MosesDetokenizer()


def get_output_attribute(out, attribute_name, cuda_device, reduction="sum"):
    """
    Handles processing/reduction of output for both DataParallel and
    non-DataParallel situations. In the multi-GPU case, this function sums a
    given output attribute across the replicas' batches.

    Parameters
    ---------------------
    :param out: Dictionary, output of model during forward pass
    :param attribute_name: str
    :param cuda_device: list or int
class Tokenizer(object):

    def __init__(self, vocab_file=None, additional_tokens=None, use_moses=None):
        self.special_tokens = [PAD_TOKEN, UNK_TOKEN, BOS_TOKEN, EOS_TOKEN]
        if use_moses is not None:
            self.enable_moses(lang=use_moses)
        if additional_tokens is not None:
            self.special_tokens += additional_tokens
        self.__word2idx = {}
        self.vocab_file = vocab_file
        if os.path.isfile(vocab_file):
            self.load_vocab(vocab_file)

    def enable_moses(self, lang='en', tokenize=True, detokenize=True):
        if tokenize:
            self._moses_tok = MosesTokenizer(lang=lang)
        else:
            self._moses_tok = None
        if detokenize:
            self._moses_detok = MosesDetokenizer(lang=lang)
        else:
            self._moses_detok = None

    @property
    def vocab_size(self):
        return len(self.vocab) + len(self.special_tokens)

    def pre_tokenize(self, line):
        # was `hasattr(self, '_moses_tok')`, which is True even when the
        # attribute was explicitly set to None by enable_moses()
        if getattr(self, '_moses_tok', None) is not None:
            return self._moses_tok.tokenize(line, return_str=True)
        return line

    def post_detokenize(self, tokens):
        if getattr(self, '_moses_detok', None) is not None:
            return self._moses_detok.detokenize(tokens, return_str=False)
        return tokens

    def idx2word(self, idx):
        if idx < len(self.special_tokens):
            return self.special_tokens[idx]
        else:
            return self.vocab[idx - len(self.special_tokens)][0]

    def update_word2idx(self):
        self.__word2idx = {
            word[0]: idx + len(self.special_tokens)
            for idx, word in enumerate(self.vocab)
        }
        for i, tok in enumerate(self.special_tokens):
            self.__word2idx[tok] = i

    def word2idx(self, word):
        return self.__word2idx.get(word, UNK)

    def segment(self, line, sample=None):
        """segments a line into tokenizable items"""
        line = self.pre_tokenize(line)
        return _segment_words(line)

    def get_vocab(self, item_list, from_filenames=True, limit=None):
        vocab = _get_vocabulary(item_list=item_list,
                                segment=self.segment,
                                from_filenames=from_filenames)
        self.vocab = vocab.most_common(limit)
        self.update_word2idx()

    def save_vocab(self, vocab_filename):
        if self.vocab is not None:
            with codecs.open(vocab_filename, 'w', encoding='UTF-8') as f:
                for (key, freq) in self.vocab:
                    f.write("{0} {1}\n".format(key, freq))

    def load_vocab(self, vocab_filename, limit=None, min_count=1):
        vocab = OrderedCounter()
        with codecs.open(vocab_filename, encoding='UTF-8') as f:
            for line in f:
                try:
                    word, count = line.strip().split()
                except ValueError:  # no count given -- default to 1
                    word, count = line.strip(), 1
                count = int(count)
                if count >= min_count:
                    vocab[word] = count
        self.vocab = vocab.most_common(limit)
        self.update_word2idx()

    def tokenize(self, line, insert_start=None, insert_end=None, sample=None):
        """tokenize a line; insert_start and insert_end are lists of tokens"""
        inputs = self.segment(line)
        targets = []
        if insert_start is not None:
            targets += insert_start
        for w in inputs:
            targets.append(self.word2idx(w))
        if insert_end is not None:
            targets += insert_end
        return torch.LongTensor(targets)

    def detokenize(self, inputs, delimiter=u' '):
        token_list = [self.idx2word(int(idx)) for idx in inputs]
        token_list = self.post_detokenize(token_list)
        outputs = delimiter.join(token_list)
        return outputs
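# Minimal usage sketch, assuming the PAD/UNK/BOS/EOS constants this module
# defines and a 'vocab.txt' of "word count" lines (hypothetical path).
tok = Tokenizer(vocab_file='vocab.txt', use_moses='en')
ids = tok.tokenize("Hello, world!")   # LongTensor of vocabulary indices
text = tok.detokenize(ids.tolist())   # Moses-detokenized string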
def load_model(model_dir, bpe_src_code=None, tokenize=None):
    """
    Start the bot. This means loading the model according to the config file.

    :param model_dir: Model directory of trained Joey NMT model.
    :param bpe_src_code: BPE codes for source side processing (optional).
    :param tokenize: If True, tokenize inputs with Moses tokenizer.
    :return:
    """
    conf = {}
    cfg_file = model_dir + "/config.yaml"

    logger = logging.getLogger(__name__)
    conf["logger"] = logger

    # load the Joey configuration
    cfg = load_config(cfg_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False)
    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])

        # tokenize input
        def tokenizer(x):
            return src_tokenizer.tokenize(x, return_str=True)

        def detokenizer(x):
            return trg_tokenizer.detokenize(x.split(), return_str=True)
    else:
        def tokenizer(x):
            return x

        def detokenizer(x):
            return x

    if bpe_src_code is not None and conf["level"] == "bpe":  # was bare `level` (NameError)
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)

        def segmenter(x):
            return bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        def segmenter(x):
            return list(x.strip())
    else:
        def segmenter(x):
            return x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]

    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"],
                        src_vocab=conf["src_vocab"],
                        trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])

    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
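# Sketch of driving the returned conf; `translate` is a hypothetical decode
# helper, but the preprocess/postprocess chains are what load_model builds.
conf = load_model("models/en-de", bpe_src_code="bpe.codes", tokenize=True)
line = "Machine translation is fun."
for fn in conf["preprocess"]:   # Moses tokenize, then BPE/char segmentation
    line = fn(line)
hyp = translate(line, conf)     # hypothetical: greedy/beam decode with conf["model"]
for fn in conf["postprocess"]:  # Moses detokenize
    hyp = fn(hyp)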
class LegalDoc:

    # ---- Constant static fields ----

    # Patterns
    __SENTENCING_IDENTIFIER_PATTERN = re.compile(r".+(DATE OF SENTENCE:)", re.S | re.M)
    __EMPTY_LINE_PATTERN = re.compile(r"^[\s\t\n\r]*$")
    __FILE_SECTION_PATTERN = re.compile(r"Section:[0-9]+")
    __SECTION_PATTERN = re.compile(r"([0-9]+)[.(\s\t]*([A-Z].+)")
    __DOCUMENT_PATTERN = re.compile(
        r"(.+?)" +                    # Head
        r"(" +                        # Capture Body start
        r"(?:^1[.\t\s]*[A-Z])" +      # 1st section number and 1st capital letter
        r"(?:.+)" +                   # Body sans above capture
        r")",                         # Capture Body end
        re.S | re.M)
    __CASE_NUMBER_PATTERN = re.compile(
        r"^.+" +
        r"(?:" +
        r"(?:AP|CR)" +                           # E.g. "CR"
        r"|" +
        r"(?:Case No(?:[.\s\tA-Za-z]*))" +       # E.g. "Case No. X"
        r")" +
        r"([0-9\s-]+[0-9])" +                    # E.g. "-12-34567"
        r"[\s\t]*$",
        re.S | re.M)                             # was "+re.S | re.M" (stray unary plus)
    __DEFENDANT_NAME_PATTERN = re.compile(
        r".+" +
        r"(?:^\|[\s]+[vV][\s]+\|$)" +            # E.g. "| v |"
        r"(?:[-\s]+)" +                          # E.g. "------"
        r"^\|[\s]+" +                            # E.g. "| "
        r"([A-Za-z\s-]+)",                       # E.g. "John Smith"
        re.S | re.M)
    __JUDGE_NAME_PATTERN = re.compile(
        r".+JUDGE:[\s|]+" +                      # E.g. "JUDGE: |"
        r"(?:(?:HIS|HER)[\s].+[\s]+JUDGE?)?" +   # E.g. "HIS HONOUR CHIEF JUDGE"
        r"([a-zA-Z\s.']+)\|",                    # E.g. "J. Smith"
        re.S | re.M | re.I)
    NAME_SIMPLIFIER_PATTERN = re.compile(r"([A-Z][A-Za-z]+)")
    SECTION_IDENTIFIER_PATTERN = re.compile(r"(SECTIONSTART[0-9]+:?\s?)")

    # Settings
    ANONYMIZE_NAMES: bool = False
    CLEAN_DATA: bool = False
    REMOVE_PUNCTUATION: bool = False
    REMOVE_STOP_WORDS: bool = False
    APPLY_STEMMING: bool = False
    APPLY_LEMMATIZATION: bool = False
    TO_LOWER_CASE: bool = False

    # ~95% success rate if False, but ~20% of LegalDocs will have a generated
    # case number and/or sections that lack structure.
    # Otherwise, ~75% success rate if True.
    EXIT_IF_ERRORS: bool = True

    # Singletons
    MONTHS = [
        "January", "February", "March", "April", "May", "June",  # was '"May" "June"' (missing comma)
        "July", "August", "September", "October", "November", "December"
    ]
    SENTENCE_TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
    STOP_WORDS = set(stopwords.words("english"))
    STEMMER = PorterStemmer()
    LEMMATIZER = WordNetLemmatizer()
    DETOKENIZER = MosesDetokenizer()
    TFIDF_VECTORIZER = TfidfVectorizer()
    NAMES = sorted(
        list(
            set(
                nltk.corpus.names.words('male.txt') +
                nltk.corpus.names.words('female.txt'))))

    # ---- Static fields ----
    s_successful_init_count: int = 0
    s_failed_init_count: int = 0
    s_exception_data_dict: dict = dict()
    s_legal_doc_dict: dict = dict()
    s_file_error: int = 0
    s_case_error: int = 0
    s_judge_error: int = 0
    s_defendant_error: int = 0
    s_sentencing_error: int = 0
    s_section_error: int = 0
    s_body_error: int = 0
    s_parsing_error: int = 0
    s_unknown_error: int = 0

    # ---- Constructor ----
    def __init__(self):
        # Initialise fields
        self.__f_path = "NULL"
        self.__f_file_name = "NULL"
        self.__f_head = "NULL"
        self.__f_body = []
        self.__f_case_number = "NULL"
        self.__f_judge_name = "NULL"
        self.__f_defendant_name = "NULL"
        self.__f_corpus = []
        self.__f_sentencing_document = False
        self.__f_parsing_error = False
        self.__f_punctuation_removed = False
        self.__f_lower_case = False
        self.__f_stop_words_removed = False
        self.__f_stemmed = False
        self.__f_lemmatized = False
        self.__f_contains_errors = False
        self.__f_tokenized_sentences = False

    # ---- Instance methods ----

    # Initialise
    def initialise(self, a_path, load_state):
        """
        Initialises a LegalDoc instance from a file.
        Separated from __init__ to avoid exceptions in the constructor;
        this method must be executed after construction.

        :param str a_path: The path to a legal document
        :param bool load_state: Whether the provided path points to a
            formatted file (True) or an unformatted file (False)
        :rtype: bool
        :return: Whether an instance was successfully generated from the file
            at the provided path
        """

        # Initialise path
        self.__f_path = a_path

        # Get file name
        head, tail = ntpath.split(self.path)
        self.__f_file_name = (tail or ntpath.basename(head))

        # Read in file
        l_file = None
        try:
            l_file = open(self.path)
            l_file_content = l_file.read()
            l_file.close()

        # Handle file error
        except IOError:
            LegalDoc.__note_exception(self.path,
                                      "MAJOR ERROR: Unable to read file", True)
            l_file.close()
            return False

        # Load state from a formatted file
        if load_state:
            # TODO - Timer start
            Timers.s_init_load_state_timer.start()
            l_succeeded = self.__initialise_load_state(l_file_content)
            # TODO - Timer stop
            Timers.s_init_load_state_timer.stop()

            if not l_succeeded:
                LegalDoc.s_file_error += 1
                return False

        # Generate state from an unformatted file
        else:
            # TODO - Timer start
            Timers.s_init_gen_state_timer.start()
            l_succeeded = self.__initialise_generate_state(l_file_content)
            # TODO - Timer stop
            Timers.s_init_gen_state_timer.stop()

            if not l_succeeded:
                return False

            self.__f_punctuation_removed = LegalDoc.REMOVE_PUNCTUATION
            self.__f_lower_case = LegalDoc.TO_LOWER_CASE
            self.__f_stop_words_removed = LegalDoc.REMOVE_STOP_WORDS
            self.__f_stemmed = LegalDoc.APPLY_STEMMING
            self.__f_lemmatized = LegalDoc.APPLY_LEMMATIZATION

        # Note successful initialisation
        LegalDoc.s_successful_init_count += 1

        # Add current LegalDoc to static dictionary of LegalDocs
        LegalDoc.s_legal_doc_dict[self.file_name] = self

        # Create judge and add it to static dictionary of judges
        Judge.add_legal_doc(self)
        return True

    # Load state from a formatted file
    def __initialise_load_state(self, a_file_content):
        """
        Initialises a LegalDoc instance from a formatted file.

        :type a_file_content: str
        :rtype: bool
        :return: Whether an instance was successfully generated from the
            source file content
        """
        try:
            l_lines = a_file_content.splitlines()
            i = 0

            # Verify this is a formatted LegalDoc
            if l_lines[0] == "FIELD DATA:":
                i += 1

                # Read in field data
                while l_lines[i] != "SECTIONS:":
                    l_line = l_lines[i].strip()

                    # File name
                    if l_line == "FILE NAME:":
                        self.__f_file_name = l_lines[i + 1].strip()
                        i += 2
                        continue

                    # Case number
                    if l_line == "CASE NUMBER:":
                        self.__f_case_number = l_lines[i + 1].strip()
                        i += 2
                        continue

                    # Judge name
                    if l_line == "JUDGE NAME:":
                        self.__f_judge_name = l_lines[i + 1].strip()
                        i += 2
                        continue

                    # Defendant name
                    if l_line == "DEFENDANT NAME:":
                        self.__f_defendant_name = l_lines[i + 1].strip()
                        i += 2
                        continue

                    # Sentencing document
                    if l_line == "PRISON DOCUMENT:":
                        self.__f_sentencing_document = ast.literal_eval(l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Punctuation removed
                    if l_line == "PUNCTUATION REMOVED:":
                        self.__f_punctuation_removed = ast.literal_eval(l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Stop words removed
                    if l_line == "STOP WORDS REMOVED:":
                        self.__f_stop_words_removed = ast.literal_eval(l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Lower case
                    if l_line == "LOWER CASE:":
                        self.__f_lower_case = ast.literal_eval(l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Stemmed
                    if l_line == "STEMMED:":
                        self.__f_stemmed = ast.literal_eval(l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Lemmatized
                    if l_line == "LEMMATIZED:":
                        self.__f_lemmatized = ast.literal_eval(l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Contains errors
                    if l_line == "CONTAINS ERRORS:":
                        self.__f_contains_errors = ast.literal_eval(l_lines[i + 1].strip())
                        i += 2
                        continue

                    # Skip line
                    i += 1

            # Read in section data
            l_section_index = -1
            i += 1
            while i < len(l_lines):
                l_line = l_lines[i].strip()

                # Reading a section heading
                if LegalDoc.__FILE_SECTION_PATTERN.match(l_line):
                    l_section_index += 1
                    self.__f_body.append([])

                # Reading a section's contents
                else:
                    self.__f_body[l_section_index].append(l_line)
                i += 1

            # Create corpora
            for l_section in self.body:
                for l_sentence in l_section:
                    self.__f_corpus += word_tokenize(l_sentence)
            return True

        except IndexError:
            LegalDoc.__note_exception(
                self.path,
                "MAJOR ERROR: Failed to import formatted file, index out of bounds",
                True)
            return False

    # Generate state from an unformatted file
    def __initialise_generate_state(self, a_file_content):
        """
        Initialises a LegalDoc instance from an unformatted file.

        :type a_file_content: str
        :rtype: bool
        :return: Whether an instance was successfully generated from the
            source file content
        """
        try:
            # Break up document into base components
            l_document_match = LegalDoc.__DOCUMENT_PATTERN.match(a_file_content)
            if l_document_match:
                l_document_groups = l_document_match.groups()

            # Handle document parsing error
            else:
                LegalDoc.__note_exception(
                    self.path, "MAJOR ERROR: Regex cannot parse document", True)
                LegalDoc.s_parsing_error += 1
                return False

            # Extract head
            self.__f_head = l_document_groups[0]

            # Extract sentencing identifier
            if not self.__extract_sentencing_identifier():
                return False

            # Extract case number
            if not self.__extract_case_number():
                return False

            # Extract defendant's name
            if not self.__extract_defendant_name():
                return False

            # Extract judge's name
            if not self.__extract_judge_name():
                return False

            # Group lines into sections
            l_lines = l_document_groups[1].splitlines()  # Body broken down by line
            if not self.__group_lines_into_sections(l_lines):
                return False

            # Anonymize names
            self.__anonymize_names()

            # Clean sections
            self.__clean_sections()

            # Initialisation completed with no errors
            return True

        # Handle miscellaneous errors
        except Exception:
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: Unspecified error occurred", True)
            LegalDoc.s_unknown_error += 1
            raise

    # Anonymize names
    def __anonymize_names(self):
        """
        Changes every instance of the defendant's name to "Defendant".
        Encrypts the judge's name.
        Assigns a random name to everybody else.
        """
        if LegalDoc.ANONYMIZE_NAMES:

            # TODO - Timer start
            Timers.s_anonymize_names_timer.start()

            # Generate corpus
            self.generate_corpus_from_sections()

            # TODO - Timer start
            Timers.s_anonymization_timer.start()

            # Get list of names
            l_filtered_corpus = [w for w in self.corpus if w[0].isupper()]

            l_names = []
            for w in l_filtered_corpus:
                i = LegalDoc.index(LegalDoc.NAMES, w)
                if i is not None:
                    l_names.append(LegalDoc.NAMES[i])

            # TODO - Timer stop
            Timers.s_anonymization_timer.stop()

            # Create a random name dictionary
            l_random_names = dict()
            for l_name in l_names:
                l_random_index = random.randint(0, len(LegalDoc.NAMES) - 1)
                l_random_names[l_name] = LegalDoc.NAMES[l_random_index]

            # Anonymize names
            for i, l_word in enumerate(self.corpus):
                try:
                    if l_word in self.defendant_name:
                        if self.corpus[i - 1] == "Defendant":
                            del self.corpus[i]
                        else:
                            self.corpus[i] = "Defendant"
                    elif l_word in self.judge_name:
                        if self.corpus[i - 1] == "Judge":
                            del self.corpus[i]
                        else:
                            self.corpus[i] = "Judge"
                    elif l_word in l_names and l_word not in LegalDoc.MONTHS:
                        self.corpus[i] = l_random_names[l_word]
                except IndexError:
                    print("FAIL")
                    continue

            self.generate_sections_from_corpus()

            # TODO - Timer stop
            Timers.s_anonymize_names_timer.stop()

    # Extract sentencing identifier
    def __extract_sentencing_identifier(self):
        """
        Extracts the sentencing identifier from this legal document's head
        and sets the value of "__f_sentencing_document" as a bool.

        :rtype: bool
        :return: Whether the sentencing identifier was successfully extracted and set
        """

        # Extract sentencing identifier
        l_sentencing_identifier_match = \
            LegalDoc.__SENTENCING_IDENTIFIER_PATTERN.match(self.__f_head)
        if l_sentencing_identifier_match:
            self.__f_sentencing_document = True
            return True

        # Handle non-sentencing document
        else:
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: This is not a sentencing document", True)
            LegalDoc.s_sentencing_error += 1
            return False

    # Extract case number
    def __extract_case_number(self):
        """
        Extracts the case number from this legal document's head, cleans it,
        and sets the value of "__f_case_number" as a string.

        :rtype: bool
        :return: Whether the case number was successfully extracted and set
        """

        # Extract case number
        l_case_num_match = LegalDoc.__CASE_NUMBER_PATTERN.match(self.__f_head)
        if l_case_num_match:
            # Extract case number whilst removing dashes, spaces and tabs
            self.__f_case_number = (l_case_num_match.groups())[0].translate(
                {ord(c): None for c in r'- '})
            return True

        # Handle failure to find a case number
        else:
            LegalDoc.__note_exception(self.path,
                                      "ERROR: Unable to find case number",
                                      LegalDoc.EXIT_IF_ERRORS)
            LegalDoc.s_case_error += 1
            if LegalDoc.EXIT_IF_ERRORS:
                return False

    # Extract defendant's name
    def __extract_defendant_name(self):
        """
        Extracts the defendant's name from this legal document's head,
        removes initials, and ensures the name is all lower case with the
        exception of the first letter.
        Sets the value of "__f_defendant_name" as a set of strings
        (e.g. "John Smith" becomes {"John", "Smith"}).

        :rtype: bool
        :return: Whether the defendant's name was successfully extracted and set
        """
        l_defendant_name_match = LegalDoc.__DEFENDANT_NAME_PATTERN.match(self.head)

        # Check for regex match
        if l_defendant_name_match:
            # Clean name and set value of "__f_defendant_name"
            self.__f_defendant_name = (l_defendant_name_match.groups())[0].strip()
            self.__f_defendant_name = LegalDoc.NAME_SIMPLIFIER_PATTERN.findall(
                self.defendant_name)
            self.__f_defendant_name = set(
                [x.lower().capitalize() for x in self.defendant_name])
            return True

        # Handle inability to determine defendant's name
        else:
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: Unable to find defendant's name", True)
            LegalDoc.s_defendant_error += 1
            return False

    # Extract judge's name
    def __extract_judge_name(self):
        """
        Extracts the judge's name from this legal document's head,
        removes initials, and ensures the name is all lower case with the
        exception of the first letter.
        Sets the value of "__f_judge_name" as a set of strings
        (e.g. "John Smith" becomes {"John", "Smith"}).

        :rtype: bool
        :return: Whether the judge's name was successfully extracted and set
        """

        # Check for regex match
        l_judge_name_match = LegalDoc.__JUDGE_NAME_PATTERN.match(self.head)
        if l_judge_name_match:
            # Clean name and set value of "__f_judge_name"
            self.__f_judge_name = (l_judge_name_match.groups())[0].strip()
            self.__f_judge_name = LegalDoc.NAME_SIMPLIFIER_PATTERN.findall(
                self.judge_name)
            self.__f_judge_name = set(
                [x.lower().capitalize() for x in self.judge_name])
            return True

        # Handle inability to determine judge's name
        else:
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: Unable to find judge's name", True)
            LegalDoc.s_judge_error += 1
            return False

    # Group lines into sections
    def __group_lines_into_sections(self, a_lines):
        """
        Groups the provided list of lines into sections comprised of sentences.

        :type a_lines: list
        :rtype: bool
        :return: Whether the lines were successfully grouped into sections
        """
        l_sections = []

        # Group lines into sections
        try:
            l_section_index = 0  # Used to check whether sections are being missed
            l_sections.append("")  # l_sections[0] catches any lines prior to the first section
            l_bad_sections = False  # True if any problems are encountered whilst parsing sections

            # For each line in a_lines...
            for l_line in a_lines:
                l_section_match = LegalDoc.__SECTION_PATTERN.match(l_line)

                # Check if the line contains the start of a section
                if l_section_match:

                    # Remove the section number from the line
                    l_line = LegalDoc.__SECTION_PATTERN.sub(r"\g<2>", l_line, 1)

                    # If the section number in the line matches l_section_index
                    if l_section_match[1] == str(l_section_index + 1):
                        l_section_index += 1
                        l_section = "SECTIONSTART" + str(l_section_index) + ":\t" + l_line
                        l_sections.append(l_section)

                    # A parsing error has occurred
                    else:
                        l_bad_sections = True
                        l_sections[l_section_index] += l_line

                # Check if the line is empty
                elif LegalDoc.__EMPTY_LINE_PATTERN.match(l_line):
                    continue

                # This line is not the start of a section nor is it empty
                else:
                    # The line is part of a section
                    if l_section_index > 0:
                        l_sections[l_section_index] += l_line

                    # This line is prior to all sections. Add it to section 0
                    else:
                        l_bad_sections = True
                        l_sections[l_section_index] += l_line

            # Handle section parsing errors
            if l_bad_sections:
                self.__f_contains_errors = True
                LegalDoc.__note_exception(self.path, "ERROR: Bad section(s)",
                                          LegalDoc.EXIT_IF_ERRORS)
                LegalDoc.s_section_error += 1
                if LegalDoc.EXIT_IF_ERRORS:
                    return False

            # Break up sections into sentences.
            # Add N sentence arrays to body, where N is the number of sections
            for l_section in l_sections:
                self.body.append(LegalDoc.SENTENCE_TOKENIZER.tokenize(l_section))
            return True

        # Handle failure to parse document's body
        except (TypeError, AttributeError, IndexError):
            LegalDoc.__note_exception(
                self.path, "MAJOR ERROR: Unable to break down body", True)
            LegalDoc.s_body_error += 1
            return False

    # Tokenize each sentence in each section in the body
    def tokenize_sentences(self):
        # Check if sentences are already tokenized
        if not self.tokenized_sentences:
            l_tokenized_sections = []

            # By section
            for l_section in self.body:
                l_tokenized_sentences = []

                # By sentence
                for l_sentence in l_section:
                    # Add tokenized sentences to section
                    l_tokenized_sentences.append(word_tokenize(l_sentence))

                # Add tokenized sections to sections list
                l_tokenized_sections.append(l_tokenized_sentences)

            # Update body
            self.__f_body = l_tokenized_sections
            self.__f_tokenized_sentences = True

    # Detokenize each sentence in each section in the body
    def detokenize_sentences(self):
        # Check if sentences are already tokenized
        if self.tokenized_sentences:
            l_detokenized_sections = []

            # By section
            for l_section in self.body:
                l_detokenized_sentences = []

                # By sentence
                for l_sentence in l_section:
                    # Add detokenized sentences to section
                    l_detokenized_sentences.append(
                        LegalDoc.DETOKENIZER.detokenize(l_sentence, return_str=True))

                # Add detokenized sections to sections list
                l_detokenized_sections.append(l_detokenized_sentences)

            # Update body
            self.__f_body = l_detokenized_sections
            self.__f_tokenized_sentences = False

    # Generates untokenized sections from the words in the corpus
    def generate_sections_from_corpus(self):
        """
        This method will not work properly if punctuation has been removed
        or if all words have been lower cased.
        """

        # TODO - Timer start
        Timers.s_gen_secs_from_corpus_timer.start()

        self.__f_body = []
        l_section_words = []
        for l_word in self.corpus:
            if LegalDoc.SECTION_IDENTIFIER_PATTERN.match(l_word):
                # Detokenize section words list into a string
                l_detokenized_section = LegalDoc.DETOKENIZER.detokenize(
                    l_section_words, return_str=True)

                # Tokenize section string into sentences (a list of strings)
                self.body.append(
                    LegalDoc.SENTENCE_TOKENIZER.tokenize(l_detokenized_section))

                # New section
                l_section_words = [l_word]
            else:
                # Add word to section words list
                l_section_words.append(l_word)

        # TODO - Timer stop
        Timers.s_gen_secs_from_corpus_timer.stop()

    # Creates a corpus from the sentences in the body's sections
    def generate_corpus_from_sections(self):
        """
        This method will not work properly if punctuation has been removed
        or if all words have been lower cased.
        """

        # TODO - Timer start
        Timers.s_gen_corpus_from_secs_timer.start()

        # Check whether sentences are tokenized already
        if self.tokenized_sentences:
            # Create corpus from tokenized sentences
            for l_section in self.body:
                for l_sentence in l_section:
                    for l_word in l_sentence:
                        self.corpus.append(l_word)
        else:
            # Create corpus from untokenized sentences
            for l_section in self.body:
                for l_sentence in l_section:
                    for l_word in word_tokenize(l_sentence):
                        self.corpus.append(l_word)

        # TODO - Timer stop
        Timers.s_gen_corpus_from_secs_timer.stop()

    # Cleans the sentences in the body's sections
    def __clean_sections(self):
        if LegalDoc.CLEAN_DATA:

            # TODO - Timer start
            Timers.s_clean_sections_timer.start()

            # Tokenize sections
            self.tokenize_sentences()

            # Clean data
            l_filtered_sections = []

            # By section
            for l_section in self.body:
                l_filtered_sentences = []

                # By sentence
                for l_sentence in l_section:
                    l_filtered_words = []

                    # By word
                    for l_word in l_sentence:

                        # Remove stopwords
                        if l_word in LegalDoc.STOP_WORDS and LegalDoc.REMOVE_STOP_WORDS:
                            continue

                        # Stemming
                        if LegalDoc.APPLY_STEMMING:
                            l_word = LegalDoc.STEMMER.stem(l_word)

                        # Lemmatization
                        if LegalDoc.APPLY_LEMMATIZATION:
                            l_word = LegalDoc.LEMMATIZER.lemmatize(l_word)

                        # Remove punctuation
                        if LegalDoc.REMOVE_PUNCTUATION:
                            l_word = l_word.translate(
                                str.maketrans('', '', string.punctuation))

                        # To lower case
                        if LegalDoc.TO_LOWER_CASE:
                            l_word = l_word.lower()

                        # Add filtered word to sentence
                        l_filtered_words.append(l_word)

                    # Add filtered sentence to section
                    l_filtered_sentences.append(l_filtered_words)

                # Add filtered section to section list
                l_filtered_sections.append(l_filtered_sentences)

            # Update body
            self.__f_body = l_filtered_sections

            # Create corpus from sections
            self.generate_corpus_from_sections()

            # Detokenize sentences
            self.detokenize_sentences()

            # TODO - Timer stop
            Timers.s_clean_sections_timer.stop()

    # Strip section identifiers
    def strip_section_identifiers(self, a_generate_corpus=True):
        for l_section in self.body:
            for i, l_sentence in enumerate(l_section):
                l_section[i] = LegalDoc.SECTION_IDENTIFIER_PATTERN.sub(
                    "", l_sentence, 1)
        if a_generate_corpus:
            self.generate_corpus_from_sections()

    # Save formatting as a txt file
    def write(self, a_raw_text=False, a_prefix="", a_new_path=""):
        """
        Writes the data in this instance to a .TXT file.
        The case name and number are used to name the file.
        """

        # TODO - Timer start
        Timers.s_write_timer.start()

        # Make sure that "CaseName" and "CaseNumber" do not contain illegal
        # values and are not excessively long
        l_safe_file_name = re.sub(r'[\\/:"*?<>|]+', "", self.file_name)
        l_safe_file_name = (l_safe_file_name[:25] + '..') \
            if len(l_safe_file_name) > 25 else l_safe_file_name

        # Add brackets to prefix if specified
        if a_prefix:
            a_prefix = "(" + a_prefix + ")"

        # Select path string based on input
        if a_new_path:
            l_path = a_new_path
        else:
            l_path = "Resources/Output/Formatted/"

        # Make path if it doesn't exist
        if not os.path.exists(l_path):
            os.makedirs(l_path)

        # Save file
        l_save_file = None
        try:
            l_save_file = open(l_path + a_prefix + "(F) " + l_safe_file_name,
                               "w", encoding="UTF-8")

            # Remove section identifiers
            self.strip_section_identifiers(True)

            # Write formatted LegalDoc
            if not a_raw_text:
                l_save_file.write(self.__str__())

            # Only write body's contents (unformatted)
            else:
                for l_section in self.body:
                    for l_sentence in l_section:
                        l_save_file.write(l_sentence + " \n")
            l_save_file.close()

            # TODO - Timer stop
            Timers.s_write_timer.stop()

        # Handle IO exception
        except IOError:
            print("ERROR: Unable to save file with path: " + self.path)
            l_save_file.close()

            # TODO - Timer stop
            Timers.s_write_timer.stop()

    # ---- Method overrides ----

    # Override str(self) with formatted body output
    def __str__(self):
        # Write field data
        l_info = "FIELD DATA:\n"
        l_info += "\tFILE NAME:\n\t\t" + self.file_name + "\n"
        l_info += "\tCASE NUMBER:\n\t\t" + self.case_number + '\n'
        l_info += "\tJUDGE NAME:\n\t\t" + str(self.judge_name) + '\n'
        l_info += "\tDEFENDANT NAME:\n\t\t" + str(self.defendant_name) + '\n'
        l_info += "\tPRISON DOCUMENT:\n\t\t" + str(self.sentencing_document) + '\n'
        l_info += "\tPUNCTUATION REMOVED:\n\t\t" + str(self.punctuation_removed) + '\n'
        l_info += "\tLOWER CASE:\n\t\t" + str(self.lower_case) + '\n'
        l_info += "\tSTOP WORDS REMOVED:\n\t\t" + str(self.stop_words_removed) + '\n'
        l_info += "\tSTEMMED:\n\t\t" + str(self.stemmed) + '\n'
        l_info += "\tLEMMATIZED:\n\t\t" + str(self.lemmatized) + '\n'
        l_info += "\tCONTAINS ERRORS:\n\t\t" + str(self.contains_errors) + '\n'
        l_info += "SECTIONS:" + '\n'

        # Write the section headers
        for i in range(0, len(self.body)):
            l_section = self.body[i]
            l_info += '\t' + "Section:" + str(i) + '\n'

            # Write the sentences corresponding to the above section
            for l_sentence in l_section:
                l_info += "\t\t" + l_sentence + '\n'
        return l_info

    # ---- Class methods ----

    # Prints all the exception data in the exception dict as well as some
    # basic summary statistics
    @classmethod
    def print_exception_data(cls):
        # Write general error data
        l_error_data = "Successful initialisations: " + str(cls.s_successful_init_count) + '\n'
        l_error_data += "Failed initialisations: " + str(cls.s_failed_init_count) + '\n'
        l_error_data += "Success rate: " + \
            str((cls.s_successful_init_count * 1.0) /
                ((cls.s_failed_init_count * 1.0) + (cls.s_successful_init_count * 1.0))) + '\n'
        l_error_data += ("File errors: " + str(LegalDoc.s_file_error) +
                         "Case errors: " + str(LegalDoc.s_case_error) +
                         "Judge errors: " + str(LegalDoc.s_judge_error) +
                         "Defendant errors: " + str(LegalDoc.s_defendant_error) +
                         "Sentencing errors: " + str(LegalDoc.s_sentencing_error) +
                         "Section errors: " + str(LegalDoc.s_section_error) +
                         "Body errors: " + str(LegalDoc.s_body_error) +
                         "Unknown errors: " + str(LegalDoc.s_unknown_error))
        l_error_data += "Exceptions: " + '\n'

        # For each LegalDoc containing one or more errors
        # (l_path is the key, l_errors is the value)
        for l_path, l_errors in cls.s_exception_data_dict.items():

            # Write the path of the LegalDoc
            l_error_data += "\t" + l_path + '\n'

            # Write the errors associated with the above LegalDoc
            for l_error in l_errors:
                l_error_data += "\t\t" + l_error + '\n'
        print(l_error_data)

    # Notes an exception
    @classmethod
    def __note_exception(cls, a_path: str, a_exception: str, a_failed_init: bool):
        """
        Adds the provided exception data to the exception dict using the
        provided path. Also increments the class's failed init count.

        :type a_path: str
        :type a_exception: str
        :type a_failed_init: bool
        """

        # If the path already exists in the exception dictionary
        if a_path in cls.s_exception_data_dict:
            cls.s_exception_data_dict[a_path].append(a_exception)

        # Add the path to the exception dictionary
        else:
            cls.s_exception_data_dict[a_path] = [a_exception]

        if a_failed_init:
            # Increment static counter for failed initialisations
            cls.s_failed_init_count += 1

    # TODO Work on this method
    @classmethod
    def get_docs_by_regex(cls, a_regex, a_filter_labelled):
        """
        Gets a list of LegalDocs whose bodies match the provided pattern.

        :param str a_regex: A pattern used to get LegalDocs
        :return: A list of LegalDocs
        :rtype: list
        """
        assert (isinstance(a_regex, re.Pattern))

        l_matching_docs = []
        for l_legal_doc in LegalDoc.s_legal_doc_dict.values():
            l_break = False
            for l_section in l_legal_doc.body:
                if l_break:
                    break
                for l_sentence in l_section:
                    # Look for match
                    l_match = a_regex.match(l_sentence)
                    if l_match:
                        # TODO Annoying import bug, fix later
                        if not Label.Label.s_flat_labels_dict[
                                l_legal_doc.file_name] or not a_filter_labelled:
                            l_matching_docs.append(l_legal_doc.file_name)
                            l_break = True
                            break
        print(l_matching_docs)
        print(len(l_matching_docs))

    # ---- Properties (read-only getters) ----

    # Origin path
    @property
    def path(self):
        """
        :rtype: str
        :return: The path of the file that originally generated this LegalDoc instance
        """
        return self.__f_path

    # File name
    @property
    def file_name(self):
        """
        :rtype: str
        :return: The name of the file that originally generated this LegalDoc instance
        """
        return self.__f_file_name

    # Head
    @property
    def head(self):
        """
        :rtype: str
        :return: Summary information of the court proceeding
        """
        return self.__f_head

    # Body
    @property
    def body(self):
        """
        :rtype: list
        :return: The transcript of the court proceeding, broken down into
            sections (list) comprised of sentences (str)
        """
        return self.__f_body

    # Case number
    @property
    def case_number(self):
        """
        :rtype: str
        :return: The document's case number
        """
        return self.__f_case_number

    # Judge's name
    @property
    def judge_name(self):
        """
        :rtype: str
        :return: The judge's name
        """
        return self.__f_judge_name

    # Defendant's name
    @property
    def defendant_name(self):
        """
        :rtype: str
        :return: The defendant's name
        """
        return self.__f_defendant_name

    # Sentencing document
    @property
    def sentencing_document(self):
        """
        :rtype: bool
        :return: Whether the document pertains to the sentencing of an individual
        """
        return self.__f_sentencing_document

    # Punctuation removed
    @property
    def punctuation_removed(self):
        """
        :rtype: bool
        :return: Whether the document's contents have had punctuation removed
        """
        return self.__f_punctuation_removed

    # Lower case
    @property
    def lower_case(self):
        """
        :rtype: bool
        :return: Whether the document's contents have been converted to lower case
        """
        return self.__f_lower_case

    # Stop words removed
    @property
    def stop_words_removed(self):
        """
        :rtype: bool
        :return: Whether the document's contents have been stripped of stop words
        """
        return self.__f_stop_words_removed

    # Stemmed
    @property
    def stemmed(self):
        """
        :rtype: bool
        :return: Whether the document's contents have been stemmed
        """
        return self.__f_stemmed

    # Lemmatized
    @property
    def lemmatized(self):
        """
        :rtype: bool
        :return: Whether the document's contents have been lemmatized
        """
        return self.__f_lemmatized

    # Contains errors
    @property
    def contains_errors(self):
        """
        :rtype: bool
        :return: Whether the document contains broken sections, a missing
            case number, or both
        """
        return self.__f_contains_errors

    # Tokenized sentences
    @property
    def tokenized_sentences(self):
        """
        :rtype: bool
        :return: Whether the document's sections contain tokenized sentences
        """
        return self.__f_tokenized_sentences

    # Corpus
    @property
    def corpus(self):
        """
        :rtype: list
        :return: Corpus with each index corresponding to a single word
        """
        return self.__f_corpus

    # ---- Static methods ----

    @staticmethod
    def index(a_list, a_value):
        """Locate the leftmost value exactly equal to a_value."""
        i = bisect.bisect_left(a_list, a_value)
        if i != len(a_list) and a_list[i] == a_value:
            return i
        return None
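# Brief usage sketch for LegalDoc, assuming the Timers/Judge/Label helpers it
# references are importable; the input path here is made up.
doc = LegalDoc()
if doc.initialise("Resources/Input/case_001.txt", load_state=False):
    print(doc.case_number, doc.judge_name)
    doc.write()                      # persist the formatted document
LegalDoc.print_exception_data()      # summary of any parse failures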
def __init__(self, language):
    self.language = language
    self.detokenizer = MosesDetokenizer(lang=language)
def morph(self, source, reference, constrain_pos=True):
    # Return format: (raw, translation, is attack success, query number, modif_rate)
    orig_tokenized = MosesTokenizer(lang='en').tokenize(source)

    # skip sentences that are too long or too short
    if len(orig_tokenized) < 10 or len(orig_tokenized) > 100:
        return source, reference, None, None, None

    # generate candidates
    pos_tagged = [
        (tagged[0], '.') if '&' in tagged[0] else tagged
        for tagged in nltk.pos_tag(orig_tokenized, tagset='universal')
    ]
    token_inflections = self.get_inflections(orig_tokenized, pos_tagged,
                                             constrain_pos)

    # get original bleu
    original_bleu, orig_predicted = self.get_bleu(source, reference)

    # skip examples that already have BLEU == 0
    if original_bleu == 0:
        return source, reference, None, None, None

    forward_perturbed, forward_bleu, forward_predicted, num_queries_forward = \
        self.search_nmt(token_inflections, orig_tokenized, source,
                        original_bleu, reference)
    if forward_bleu == original_bleu:
        forward_predicted = orig_predicted

    # attack success
    if forward_bleu == 0:
        modif_rate = self.get_modif_rate(orig_tokenized, forward_perturbed)
        attack_text = MosesDetokenizer(lang='en').detokenize(forward_perturbed)
        return attack_text, forward_predicted, True, num_queries_forward + 1, modif_rate

    backward_perturbed, backward_bleu, backward_predicted, num_queries_backward = \
        self.search_nmt(token_inflections, orig_tokenized, source,
                        original_bleu, reference, backward=True)
    if backward_bleu == original_bleu:
        backward_predicted = orig_predicted

    num_queries = 1 + num_queries_forward + num_queries_backward
    if forward_bleu < backward_bleu:
        is_attack_success = False
        if forward_bleu == 0:
            is_attack_success = True
        modif_rate = self.get_modif_rate(orig_tokenized, forward_perturbed)
        attack_text = MosesDetokenizer(lang='en').detokenize(forward_perturbed)
        return attack_text, forward_predicted, is_attack_success, num_queries, modif_rate
    else:
        is_attack_success = False
        if backward_bleu == 0:
            is_attack_success = True
        modif_rate = self.get_modif_rate(orig_tokenized, backward_perturbed)
        attack_text = MosesDetokenizer(lang='en').detokenize(backward_perturbed)
        return attack_text, backward_predicted, is_attack_success, num_queries, modif_rate
def local_search_nmt(self, token_inflections, orig_tokenized, original,
                     original_bleu, reference, backward=False):
    perturbed_tokenized = orig_tokenized.copy()
    best_bleu = original_bleu
    num_queries = 0
    best_predicted = ''
    detokenizer = MosesDetokenizer(lang='en')

    while True:
        new_tokenized_list = []
        new_bleu_list = []
        new_predicted_list = []

        # token_inflections: list of pairs (position, candidates),
        # where candidates is a list of tokens
        for position, candidates in token_inflections:
            # add or swap
            for infl in candidates:
                if perturbed_tokenized[position] == infl:
                    continue
                # do replace
                new_tokenized = perturbed_tokenized.copy()
                new_tokenized[position] = infl
                # form text and eval
                new_text = detokenizer.detokenize(new_tokenized)
                new_bleu, new_predicted = self.get_bleu(new_text, reference)
                num_queries += 1
                # record
                new_tokenized_list.append(new_tokenized)
                new_bleu_list.append(new_bleu)
                new_predicted_list.append(new_predicted)

            # remove
            if perturbed_tokenized[position] != orig_tokenized[position]:
                # restore the original token
                new_tokenized = perturbed_tokenized.copy()
                new_tokenized[position] = orig_tokenized[position]
                # form text and eval
                new_text = detokenizer.detokenize(new_tokenized)
                new_bleu, new_predicted = self.get_bleu(new_text, reference)
                num_queries += 1
                # record
                new_tokenized_list.append(new_tokenized)
                new_bleu_list.append(new_bleu)
                new_predicted_list.append(new_predicted)

        if len(new_bleu_list) == 0:  # no improvement possible
            break

        cur_best_idx = np.argsort(new_bleu_list)[0]
        cur_best_bleu = new_bleu_list[cur_best_idx]
        cur_best_predicted = new_predicted_list[cur_best_idx]
        cur_best_tokenized = new_tokenized_list[cur_best_idx]

        # check stop criteria
        if cur_best_bleu == 0:
            perturbed_tokenized = cur_best_tokenized
            best_bleu = cur_best_bleu
            best_predicted = cur_best_predicted
            break
        if cur_best_bleu < best_bleu - EPSILON:
            perturbed_tokenized = cur_best_tokenized
            best_bleu = cur_best_bleu
            best_predicted = cur_best_predicted
        else:
            break

    # =============== check supplement set ======================
    # form supplement set
    supplement_inflections_by_position = {
        position: [] for position, _ in token_inflections
    }
    for position, candidates in token_inflections:
        for infl in candidates:
            if perturbed_tokenized[position] != infl:
                supplement_inflections_by_position[position].append(infl)

    is_sup_valid = True
    valid_positions = []
    for position, _ in token_inflections:
        if len(supplement_inflections_by_position[position]) > 1:
            is_sup_valid = False
            break
        if len(supplement_inflections_by_position[position]) == 1:
            valid_positions.append(position)
    if len(valid_positions) == 0:
        is_sup_valid = False

    if is_sup_valid:
        print('check supplement')
        supplement_tokenized = perturbed_tokenized.copy()
        for position in valid_positions:
            supplement_tokenized[position] = \
                supplement_inflections_by_position[position][0]
        # form text and eval
        supp_text = detokenizer.detokenize(supplement_tokenized)
        supp_bleu, supp_predicted = self.get_bleu(supp_text, reference)
        num_queries += 1
        if supp_bleu < best_bleu:
            best_bleu = supp_bleu
            best_predicted = supp_predicted
            perturbed_tokenized = supplement_tokenized

    return perturbed_tokenized, best_bleu, best_predicted, num_queries
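# Hypothetical driver for the attack methods above: `attacker` is an instance
# of the surrounding class and `pairs` an iterable of (source, reference).
results = []
for src, ref in pairs:
    attack_text, predicted, success, n_queries, modif_rate = \
        attacker.morph(src, ref, constrain_pos=True)
    if success is not None:  # None means the example was skipped
        results.append((attack_text, success, n_queries, modif_rate))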
def hash_fn(self, string_to_hash):
    '''Simple hash function'''
    md = MosesDetokenizer()
    if type(string_to_hash) == list:
        string_to_hash = md.detokenize(string_to_hash)
    return hashlib.sha224(string_to_hash.encode('utf-8')).hexdigest()
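# Illustrative call (assumes `obj` is an instance of the surrounding class).
# Because token lists are detokenized first, a list and its surface string
# should usually hash to the same digest:
d1 = obj.hash_fn(["Hello", ",", "world", "!"])
d2 = obj.hash_fn("Hello, world!")
print(d1 == d2)  # expected True for this example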
"""
Created on Tue Feb 19 15:06:29 2019

@author: peterawest
"""

from itertools import combinations

import torch
from os import listdir
from random import shuffle

from nltk.tokenize import sent_tokenize
from sacremoses import MosesTokenizer, MosesDetokenizer

from gpt2_token_mod import gpt2_split, gpt2_join
from pytorch_pretrained_bert import GPT2Tokenizer

mt = MosesTokenizer()
mdt = MosesDetokenizer()

punctuation = ".,:;\"\'"

global_tokenizer = None


def token_split(s, method='split', tokenizer=None):
    '''Given a string s, tokenize it with the requested method.'''
    if method == 'split':
        return s.split()
    if method == 'moses':
        tokenized_text = mt.tokenize(s, return_str=True)
        return tokenized_text.split()
    if method == 'gpt2':
        if tokenizer is None:
            global global_tokenizer
        finnish_stanza_tags_tup.append((j.text, j.xpos))
        finnish_stanza_tags.append(j.xpos)
tups_to_file('finlandes/OUTPUT.txt', finnish_stanza_tags_tup)
print("Finished tagging in Finnish: Helsinkiin")

# Chinese
# Tagging from the corpus GSDSimp
print("Started tagging in Chinese: GSDSimp")
chinese_dep_parse = '../../dependency/UD_Chinese-GSDSimp-master/zh_gsdsimp-ud-test.conllu'
with open(chinese_dep_parse, 'r') as gsdsimp_f:
    gsdsimp_text = conll_text_reader(gsdsimp_f)

detok = MosesDetokenizer()
with open('chino/INPUT.txt', 'w') as f:
    for s in gsdsimp_text:
        sent = detok.detokenize(s)
        f.write(sent + '\n')

nlp_zh = stanza.Pipeline(processors='tokenize,pos,lemma,depparse',
                         tokenize_pretokenized=True, lang='zh')
stanza_model = nlp_zh(gsdsimp_text)

chinese_stanza_tags_tup = []
chinese_stanza_tags = []
for i in stanza_model.sentences:
    for j in i.words:
        chinese_stanza_tags_tup.append((j.text, j.xpos))
class SummaryPicker:

    def __init__(self, exp_path):
        self.exp_path = exp_path
        self.load_experiment()
        self.detokenizer = MosesDetokenizer(lang='en')
        self.truecaser = MosesTruecaser(load_from='sm.cnndm.tc.model')

    def cleanup(self, line, append=False):
        # todo, use proper regex
        line = line.replace('- lrb -', '(')
        line = line.replace('- rrb -', ')')
        line = line.replace('- lsb -', '[')
        line = line.replace('- rsb -', ']')
        line = line.replace('`', "'")
        line = self.detokenizer.detokenize(line.split(' '))
        # line = line.replace(" 's ", "'s ")
        # line = line.replace(" 'd ", "'d ")
        # line = line.replace("' s ", "'s ")
        # line = line.replace(" n '", "n'")
        # line = line.replace(" n' ", "n'")
        line = line.replace(" - - ", " -- ")
        line = line.replace(" - ", "-")
        line = re.sub(r', (\d{3})', r',\1', line)
        line = line.replace("i' m ", "i'm ")
        line = line.replace(" 'll ", "'ll ")
        line = line.replace("' ll ", "'ll ")
        line = re.sub(r" '([a-zA-Z]{1}) ", r"'\1 ", line)
        line = re.sub(r"' ([a-zA-Z]{1}) ", r"'\1 ", line)
        line = re.sub(r" ([a-zA-Z]{1})' ", r" \1'", line)
        line = re.sub(r"(you|they)(' re )", r"\1're ", line)
        line = re.sub(r"(\$\d+\.) (\d+)", r"\1\2", line)
        line = re.sub(r"(\d{1,2}): (\d{1,2}) (am|pm)", r"\1:\2\3", line)
        line = line.replace(" n't ", "n't ")
        line = line.replace(" 've ", "'ve ")

        doc = nlp(line)
        lines = []

        def repr_word(tok):
            txt = tok.text_with_ws
            if tok.text in SPECIAL:
                txt = txt.replace(tok.text, SPECIAL[tok.text])
            elif tok.is_sent_start or tok.ent_type_ in [
                    'PERSON', 'ORG', 'PRODUCT', 'GPE', 'LOC', 'FAC', 'NORP',
                    'EVENT', 'WORK_OF_ART']:
                txt = txt.capitalize()
            if tok.text.upper() in COMPANIES:
                txt = txt.upper()
            return txt

        for tok in doc:
            lines.append(repr_word(tok))
        line = ''.join(lines)
        if not line.endswith('.') and append:
            line += ' ...'
        return line
        # line = line.replace(" 'm ", "'m ")
        # line = line.replace("' m ", "'m ")

    def cleanup_samples(self, samples):
        return [self.cleanup_sample(sample) for sample in samples]

    def cleanup_sample(self, sample):
        sample['source'] = self.cleanup(sample['source'], append=True)
        sample['summaries'] = {
            model: self.cleanup(line)
            for model, line in sample['summaries'].items()
        }
        return sample

    def load_corpus(self, text_path):
        with open(text_path, 'r') as f:
            return f.read().splitlines()

    def sample(self, n=15, clean=True):
        indices = random.sample(range(len(self.source)), n)
        samples = []
        for idx in indices:
            sample = {}
            sample['index'] = idx
            sample['source'] = self.source[idx]
            sample['summaries'] = {}
            sample['summaries']['gold'] = self.gold[idx]
            for key, docs in self.results.items():
                sample['summaries'][key] = docs[idx]
            if clean:
                sample = self.cleanup_sample(sample)
            samples.append(sample)
        return samples

    def load_experiment(self):
        self.results = {}
        logging.info('Loading source articles')
        self.source = self.load_corpus(os.path.join(self.exp_path, 'src.txt'))
        self.gold = None
        for d in os.listdir(self.exp_path):
            res_path = os.path.join(self.exp_path, d)
            if os.path.isdir(res_path):
                if self.gold is None:
                    logging.info('Loading gold summaries')
                    self.gold = self.load_corpus(os.path.join(res_path, 'tar.txt'))
                    assert len(self.gold) == len(self.source)
                logging.info('Loading {}'.format(res_path))
                corpus = self.load_corpus(os.path.join(res_path, 'hyp.txt'))
                assert len(corpus) == len(self.source)
                self.results[os.path.basename(res_path)] = corpus
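# Possible driver, assuming an experiment directory laid out the way
# load_experiment() expects (src.txt plus per-model subdirs with tar.txt/hyp.txt);
# the path below is hypothetical.
picker = SummaryPicker('experiments/cnndm_run')
for s in picker.sample(n=3):
    print(s['index'], s['summaries']['gold'])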
combi['tidy_tweet'] = combi['tidy_tweet'].apply(
    lambda x: ' '.join([w for w in x.split() if len(w) > 3]))

tokenized_tweet = combi['tidy_tweet'].apply(lambda x: x.split())
tokenized_tweet.head()

from nltk.stem.porter import *
stemmer = PorterStemmer()

tokenized_tweet = tokenized_tweet.apply(
    lambda x: [stemmer.stem(i) for i in x])  # stemming
tokenized_tweet.head()

# from mosestokenizer import MosesTokenizer, MosesDetokenizer
# from nltk.tokenize.moses import MosesDetokenizer
from sacremoses import MosesTokenizer, MosesDetokenizer
detokenizer = MosesDetokenizer()

for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = detokenizer.detokenize(tokenized_tweet[i], return_str=True)

combi['tidy_tweet'] = tokenized_tweet

all_words = ' '.join([text for text in combi['tidy_tweet']])

from wordcloud import WordCloud
wordcloud = WordCloud(width=800, height=500, random_state=21,
                      max_font_size=110).generate(all_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
import requests
import subprocess
import json
import os
import tempfile

from sacremoses import MosesTokenizer, MosesDetokenizer
from collections import defaultdict
from nltk import sent_tokenize

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# PROCESSING TEXT
tokenizer_en = MosesTokenizer(lang='en')
detokenizer_en = MosesDetokenizer(lang='en')
tokenizer_es = MosesTokenizer(lang='es')
detokenizer_es = MosesDetokenizer(lang='es')

MAX_NUM_TOKENS = 10
SPLIT_DELIMITER = ';'
LANGUAGE_ISO_MAP = {'en': 'english', 'es': 'spanish'}


def tokenize(text, lang, return_str=True):
    if lang == 'en':
        text_tok = tokenizer_en.tokenize(text, return_str=return_str, escape=False)
        return text_tok
    elif lang == 'es':
        # the excerpt was truncated mid-call; the Spanish branch clearly
        # mirrors the English one above
        text_tok = tokenizer_es.tokenize(text, return_str=return_str, escape=False)
        return text_tok
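# Example round-trip with the module-level tokenizers (output illustrative):
s = tokenize("Don't panic!", lang='en')          # roughly "Don 't panic !"
restored = detokenizer_en.detokenize(s.split())  # back to "Don't panic!"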
def get_detokenizer():
    from sacremoses import MosesDetokenizer
    detok = MosesDetokenizer(lang='en')
    return detok
def __init__(self):
    super(RunHP, self).__init__()

    # GENERAL #
    self.seed = 42
    self.cuda_device_id = 6
    self.device = 'cuda'  # 'cuda' or 'cpu'
    self.training_logging_step = 50  # how often to print internal metrics
    self.epochs = 10  # if set to 0, will immediately jump to evaluation
    self.learning_rate = 0.0005
    self.grads_clip = 0.25

    # GENERAL DATA RELATED #
    self.dataset = 'amazon'
    self.train_max_groups_per_batch = 6
    self.val_max_groups_per_batch = 13
    self.eval_max_groups_per_batch = 20
    self.max_rev_per_group = 8

    # DATA SOURCES #
    # `early_term` limits the number of chunks per epoch
    self.train_early_term = None
    self.val_early_term = None
    self.gener_early_term = 2

    # GENERAL PATHS #
    self.root_path = 'copycat'
    self.experiments_folder = 'first_run'
    self.output_dir = f'{self.root_path}/runs/{self.dataset}/{self.experiments_folder}'
    self.checkpoint_full_fn = 'checkpoint.tar'
    epc = ExperimentsPathController()
    self.output_path = epc(self.output_dir)
    self.checkpoint_path = f'{self.root_path}/artifacts/{self.dataset}/checkpoint.tar'
    self.tcaser_model_path = f'{self.root_path}/artifacts/{self.dataset}/data/tcaser.model'

    # DATA PATHS #
    self.base_data_path = f'data/{self.dataset}/'
    self.train_fp = comb_paths(self.base_data_path, "split/train/")
    self.val_fp = comb_paths(self.base_data_path, 'split/val/')
    self.words_vocab_fp = f'{self.root_path}/artifacts/{self.dataset}/data/words.txt'
    self.eval_dev_fp = comb_paths(self.base_data_path, 'gold', 'val.csv')
    self.eval_test_fp = comb_paths(self.base_data_path, 'gold', 'test.csv')

    # ANNEALING #
    self.c_m = 8.
    self.c_r = 0.8
    self.c_kl_ann_max_val = 1.
    self.c_kl_ann_batches = self.epochs * self.train_early_term \
        if self.train_early_term else self.epochs * 10000
    self.z_m = 8.
    self.z_c = 0.8
    self.z_kl_ann_max_val = 1.
    self.z_kl_ann_batches = self.epochs * self.train_early_term \
        if self.train_early_term else self.epochs * 10000

    # DECODING/GENERATION #
    self.beam_size = 5
    self.beam_len_norm = True
    self.beam_excl_words = []
    self.block_ngram_repeat = 3  # or None
    self.ngram_mirror_window = 3  # or None
    self.mirror_conjs = ["and", 'or', ',', 'but']  # or None
    self.block_consecutive = True
    self.min_gen_seq_len = 20

    # POST-PROCESSING AND ANALYTICS #
    mt = MosesTokenizer()
    self.tok_func = partial(mt.tokenize, escape=False)
    self.sent_split_func = nltk.sent_tokenize
    dt = MosesDetokenizer()
    self.detok_func = partial(dt.detokenize, unescape=False)
    true_caser = MosesTruecaser(load_from=self.tcaser_model_path, is_asr=True)
    self.true_case_func = partial(true_caser.truecase, return_str=True,
                                  use_known=True)
    self.analytics_func = partial(ngram_seq_analysis,
                                  tokenizer=self.tok_func,
                                  sent_splitter=self.sent_split_func,
                                  n_grams_to_comp=(2, 3, 4))
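# Sketch of chaining the post-processing callables above on generated text
# (assumes the truecaser model file exists so RunHP() can be constructed).
hp = RunHP()
raw = "the battery lasts long . works great ."
cased = ' '.join(hp.true_case_func(s) for s in hp.sent_split_func(raw))
text = hp.detok_func(hp.tok_func(cased))  # tokenize, then detokenize cleanly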
def __init__(self):
    super().__init__()
    self._tokenizer = SacreMosesTokenizer()
    self._detokenizer = MosesDetokenizer()
def load_model(self, src_language, trg_language, domain,
               bpe_src_code=None, tokenize=None):
    """Load a Joey NMT model for the given source/target language pair and domain."""
    model_dir = f"{self._model_dir_prefix}{src_language}-{trg_language}-{domain}"

    # Paths for the checkpoint, the vocabularies, and the config.
    ckpt_path = os.path.join(model_dir, 'model.ckpt')
    src_vocab_path = os.path.join(model_dir, 'src_vocab.txt')
    trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt')
    config_path = os.path.join(model_dir, 'config_orig.yaml')

    # Adjust the config and write it back out.
    config = load_config(config_path)
    new_config_file = os.path.join(model_dir, 'config.yaml')
    config = self._update_config(config, src_vocab_path, trg_vocab_path,
                                 model_dir, ckpt_path)
    with open(new_config_file, 'w') as cfile:
        yaml.dump(config, cfile)

    print('Loaded model for {}-{}.'.format(src_language, trg_language))

    conf = {}
    logger = logging.getLogger(__name__)
    conf["logger"] = logger

    # load the Joey configuration
    cfg = load_config(new_config_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False) if torch.cuda.is_available() else False
    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input, detokenize output
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(x.split(), return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x

    if bpe_src_code is not None and conf["level"] == "bpe":
        # load bpe merge file (the original referenced an undefined `level` here)
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]

    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"], src_vocab=conf["src_vocab"],
                        trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])
    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
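# Hypothetical usage sketch; the server instance and directory layout below are
# illustrative assumptions, not part of the snippet above:
# conf = server.load_model("en", "de", "iwslt",
#                          bpe_src_code="models/en-de-iwslt/bpe.codes",
#                          tokenize=True)
# model, preprocess, postprocess = conf["model"], conf["preprocess"], conf["postprocess"]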
# 8) BLEU scores
# 9) F-measure

# start the server using:
# java -Djava.io.tmpdir=tmp/ -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer \
#     -preload tokenize,ssplit,pos,lemma,ner,parse,depparse \
#     -status_port 9000 -port 9000 -timeout 15000 &

import os
import csv
from collections import Counter
import random

from nltk.parse import CoreNLPParser
from nltk.stem import PorterStemmer
from sacremoses import MosesTokenizer, MosesDetokenizer

detokenizer = MosesDetokenizer()
mt = MosesTokenizer()
# wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Lexical parser (requires the CoreNLP server started with the command above)
parser = CoreNLPParser(url='http://localhost:9000')

import kenlm
import time

# LM link: http://www.keithv.com/software/giga/
VP_2gram_LM = os.path.join("LMs", "lm_giga_64k_vp_2gram", "lm_giga_64k_vp_2gram.arpa")
NVP_2gram_LM = os.path.join("LMs", "lm_giga_64k_nvp_2gram", "lm_giga_64k_nvp_2gram.arpa")
# the snippet was cut off here; the filename follows the pattern of the paths above
VP_3gram_LM = os.path.join("LMs", "lm_giga_64k_vp_3gram", "lm_giga_64k_vp_3gram.arpa")
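# A hedged scoring sketch for the KenLM models above (assumes the .arpa files
# have been downloaded into LMs/):
# lm = kenlm.Model(VP_2gram_LM)
# print(lm.score("this is a test", bos=True, eos=True))  # log10 probability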
def __init__(self, return_str=True):
    self._return_str = return_str
    from sacremoses import MosesDetokenizer  # pylint: disable=import-outside-toplevel
    self._detokenizer = MosesDetokenizer()
def load(self, device: str):
    """
    Load user-selected task-specific model

    Args:
        device (str): device information

    Returns:
        object: User-selected task-specific model

    """
    if "sent" in self.config.n_model:
        import nltk
        try:
            nltk.data.find("tokenizers/punkt")
        except LookupError:
            nltk.download("punkt")
        from nltk.tokenize import sent_tokenize
        return PororoSentTokenizer(sent_tokenize, self.config)

    if self.config.n_model == "mecab_ko":
        try:
            import mecab
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install python-mecab-ko with: `pip install python-mecab-ko`")
        model = mecab.MeCab()
        return PororoMecabKoTokenizer(model, self.config)

    if self.config.n_model == "char":
        return PororoCharTokenizer(self.config)

    if self.config.n_model == "jamo":
        return PororoJamoTokenizer(self.config)

    if self.config.n_model == "word":
        return PororoWordTokenizer(self.config)

    if self.config.n_model == "roberta":
        from fairseq.data.encoders.gpt2_bpe import get_encoder
        encoder = download_or_load("misc/encoder.json", self.config.lang)
        vocab = download_or_load("misc/vocab.bpe", self.config.lang)
        model = get_encoder(encoder, vocab)
        with open(encoder, "r") as f_vocab:
            vocab = json.load(f_vocab)
        inv_dict = {v: k for k, v in vocab.items()}
        return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

    if self.config.n_model == "moses":
        try:
            from sacremoses import MosesDetokenizer, MosesTokenizer
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install sacremoses with: `pip install sacremoses`")
        model = MosesTokenizer(lang="en")
        detok = MosesDetokenizer(lang="en")
        return PororoMosesTokenizer(model, detok, self.config)

    if self.config.n_model == "jieba":
        try:
            import jieba
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install jieba with: `pip install jieba`")
        model = jieba.cut
        return PororoJiebaTokenizer(model, self.config)

    if self.config.n_model == "mecab":
        try:
            import fugashi
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install fugashi with: `pip install fugashi`")
        try:
            import ipadic
        except ModuleNotFoundError as error:
            raise error.__class__(
                "Please install ipadic with: `pip install ipadic`")
        dic_dir = ipadic.DICDIR
        mecabrc = os.path.join(dic_dir, "mecabrc")
        mecab_option = "-d {} -r {} ".format(dic_dir, mecabrc)
        model = fugashi.GenericTagger(mecab_option)
        return PororoMecabTokenizer(model, self.config)

    # all remaining model names fall through to the custom subword tokenizers
    from pororo.tasks.utils.tokenizer import CustomTokenizer
    path = download_or_load(
        f"tokenizers/{self.config.n_model}.zip",
        self.config.lang,
    )
    ext = "json" if "unigram" not in self.config.n_model else "txt"
    merges_filename = (f"{path}/merges.txt"
                       if "unigram" not in self.config.n_model else None)
    model = CustomTokenizer.from_file(
        vocab_filename=f"{path}/vocab.{ext}",
        merges_filename=merges_filename,
        normalize=True if "jpe" not in self.config.n_model else False,
    )
    if "jpe" in self.config.n_model:
        return PororoJamoPairTokenizer(model, self.config)
    if "mecab.bpe" in self.config.n_model:
        return PororoMecabSPTokenizer(model, self.config)
    return PororoSPTokenizer(model, self.config)
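# Hedged usage sketch; only the load() method is shown above, so the factory
# wiring below is an assumption for illustration:
# tokenizer = factory.load(device="cpu")   # e.g. with config.n_model == "moses"
# tokens = tokenizer("The quick brown fox jumps over the lazy dog.")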
"""Do moses tok detok.""" # pylint: disable=invalid-name, unused-import import sys try: import sacremoses # noqa: F401 except ModuleNotFoundError: import subprocess as sp import shlex proc = sp.Popen(shlex.split('pip install sacremoses'), stdout=-1, stderr=-1) out, err = proc.communicate() if err: sys.stderr.write('error: %s' % err.decode()) sys.stdout.write('%s' % out.decode()) from sacremoses import MosesTokenizer, MosesDetokenizer MTOK = MosesTokenizer().tokenize MDETOK = MosesDetokenizer().detokenize mtok = MTOK mdetok = MDETOK
class NPEndPatternExtractor(SkillExtractor):
    """Identify noun phrases with certain ending words (e.g. 'skills', 'abilities') as skills

    Args:
        endings (list): Single words that should identify the ending of a noun phrase
            as being a skill
        stop_phrases (list): Noun phrases that should not be considered skills
        only_bulleted_lines (bool, default True): Whether or not to only consider lines
            that look like they are items in a list
    """
    def __init__(self, endings, stop_phrases, only_bulleted_lines=True, *args, **kwargs):
        self.endings = endings
        self.stop_phrases = stop_phrases
        self.only_bulleted_lines = only_bulleted_lines
        self.detokenizer = MosesDetokenizer()

    def document_skill_counts(self, document):
        """Count skills in the document

        Args:
            document (string) A document for searching, such as a job posting

        Returns: (collections.Counter) skill occurrences in the document
        """
        skill_counts = Counter()
        for cleaned_phrase, _ in self.noun_phrases_matching_endings(document):
            skill_counts[cleaned_phrase] += 1
        return skill_counts

    def candidate_skills(self, job_posting):
        """Generate candidate skills from the job posting

        Args:
            job_posting (job_postings.JobPosting) A single job posting

        Yields: all candidate skills (algorithms.skill_extractors.base.CandidateSkill)
            found in the job posting
        """
        document = job_posting.get("description")
        for cleaned_phrase, context in self.noun_phrases_matching_endings(document):
            orig_context = self.detokenizer.detokenize([t[0] for t in context],
                                                       return_str=True)
            logging.info('Yielding candidate skill %s in context %s',
                         cleaned_phrase, orig_context)
            yield CandidateSkill(
                skill_name=cleaned_phrase,
                matched_skill=cleaned_phrase,
                confidence=95,
                context=orig_context
            )

    def noun_phrases_matching_endings(self, document):
        """From the given document, generate noun phrases ending with one of the configured terms

        Args:
            document (string) A raw text document, such as a job posting

        Yields:
            tuples, each with two strings:
                - a noun phrase
                - the context of the noun phrase (currently defined as the surrounding sentence)
        """
        document = str(document)
        lines = document.split('\n')
        for line in lines:
            if not self.only_bulleted_lines or is_bulleted(line):
                for noun_phrase, context in noun_phrases_in_line_with_context(line):
                    term_list = noun_phrase.split()
                    if term_list[-1].lower() in self.endings:
                        cleaned_phrase = clean_beginning(noun_phrase).lower()
                        if cleaned_phrase not in self.stop_phrases:
                            yield cleaned_phrase, context
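# Hedged usage sketch; the endings/stop-phrase lists and the document below are
# illustrative assumptions:
# extractor = NPEndPatternExtractor(endings=['skills', 'abilities'],
#                                   stop_phrases=['required skills'])
# counts = extractor.document_skill_counts(
#     "- Strong communication skills\n- Valid driver's license")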
def score_output(args, fname):
    sp = spm.SentencePieceProcessor()
    sp.Load('data_and_models/sim/sim.sp.30k.model')
    detok = MosesDetokenizer('en')
    tok = TreebankWordTokenizer()

    with open(fname, 'r') as f:
        lines = f.readlines()

    # parse fairseq-style generation output: T = target, H = hypothesis, S = source
    pairs = []
    pairs_bleu = []
    src = None
    for i in lines:
        if i[0] == "T":
            target = i.split()[1:]
            target = " ".join(target).replace("@@ ", "")
            target_bleu = target
            target_sim = make_example(target, detok, tok, sp)
        elif i[0] == "H":
            hyp = i.split()[2:]
            hyp = " ".join(hyp).replace("@@ ", "")
            hyp_bleu = hyp
            hyp_sim = make_example(hyp, detok, tok, sp)
        elif i[0] == "S":
            if src is not None:
                pairs.append((target_sim, hyp_sim, src_sim))
                pairs_bleu.append((target_bleu, hyp_bleu, src_bleu))
            src = i.split()[1:]
            src = " ".join(src).replace("@@ ", "")
            src_bleu = src
            src_sim = make_example(src, detok, tok, sp)
    pairs.append((target_sim, hyp_sim, src_sim))
    pairs_bleu.append((target_bleu, hyp_bleu, src_bleu))

    # load the SIM (word-averaging) model
    model = torch.load(args.sim_model_file, map_location='cpu')
    state_dict = model['state_dict']
    vocab_words = model['vocab_words']
    sim_args = model['args']
    model = WordAveraging(sim_args, vocab_words)
    model.load_state_dict(state_dict, strict=True)

    scores = []
    scores_simile = []
    for i in pairs:
        wp1 = Example(i[0])
        wp1.populate_embeddings(model.vocab)
        wp2 = Example(i[1])
        wp2.populate_embeddings(model.vocab)
        wx1, wl1, wm1 = model.torchify_batch([wp1])
        wx2, wl2, wm2 = model.torchify_batch([wp2])
        score = model.scoring_function(wx1, wm1, wl1, wx2, wm2, wl2)
        ref_l = len(i[0])
        hyp_l = len(i[1])
        # length penalty: penalizes hypotheses whose length diverges from the reference
        lp = np.exp(1 - max(ref_l, hyp_l) / float(min(ref_l, hyp_l)))
        simile = lp**args.length_penalty * score.data[0]
        scores_simile.append(simile)
        scores.append(score.data[0])

    print("SIM: {0}".format(np.mean(scores)))
    print("SimiLe: {0}".format(np.mean(scores_simile)))

    with open(fname + ".target.out", "w") as fout:
        for i in pairs_bleu:
            fout.write(i[0].strip() + "\n")
    with open(fname + ".hyp.out", "w") as fout:
        for i in pairs_bleu:
            fout.write(i[1].strip() + "\n")
    with open(fname + ".src.out", "w") as fout:
        for i in pairs_bleu:
            fout.write(i[2].strip() + "\n")

    cmd = "perl multi-bleu.perl {0} < {1}".format(fname + ".target.out",
                                                  fname + ".hyp.out")
    os.system(cmd)
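# The SimiLe length penalty used above, shown standalone; `alpha` plays the role
# of args.length_penalty (the call below is illustrative):
import numpy as np

def simile_length_penalty(ref_len, hyp_len, alpha):
    lp = np.exp(1 - max(ref_len, hyp_len) / float(min(ref_len, hyp_len)))
    return lp ** alpha  # multiply by the SIM score to obtain SimiLe

print(simile_length_penalty(20, 15, 0.25))  # shorter hypothesis -> penalty < 1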
    },
    JOINT_TUNING: {
        'train': gold_train_data_source,
        'val': gold_val_data_source
    }
}

# TRUECASER #
tcaser = MosesTruecaser(load_from=run_conf.tcaser_model_path, is_asr=True)
tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)

# WORD TOKENIZERS / DE-TOKENIZERS #
mt = MosesTokenizer()
dt = MosesDetokenizer()

# SUB-WORD TOKENIZER #
bpe = BPE(glossaries=SPECIAL_TOKENS)
bpe.load(bpcodes_fp=run_conf.bpe_fp)

unsup_tok_func = lambda x: bpe.tokenize(tcase_func(x).split())
gold_tok_func = lambda x: bpe.tokenize(mt.tokenize(tcase_func(x), escape=False))
detok_func = lambda x: dt.detokenize(bpe.detokenize(x), unescape=False)

# DATA PIPELINES AND VOCAB #
vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT, lowercase=False,
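# Round-trip sketch for the pipelines above (input string is illustrative):
# subwords = gold_tok_func("The battery life is great!")  # truecase -> Moses tokenize -> BPE
# text = detok_func(subwords)                             # BPE merge -> Moses detokenize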
# Corpus BLEU with arguments
# Run this file from CMD/Terminal
# Example Command: python3 compute-bleu-args.py test_file_name.txt mt_file_name.txt

import sys
import sacrebleu

# Only if you originally used MosesTokenizer
from sacremoses import MosesDetokenizer
md = MosesDetokenizer(lang='en')

target_test = sys.argv[1]  # test file argument
target_pred = sys.argv[2]  # MTed file argument

# Open the test dataset human translation file and detokenize the references
refs = []
with open(target_test) as test:
    for line in test:
        line = line.strip().split()
        line = md.detokenize(line)
        refs.append(line)

print("Reference 1st sentence:", refs[0])
refs = [refs]  # yes, it is a list of list(s), as required by sacreBLEU

# Open the translation file by the NMT model and detokenize the predictions
# (the snippet was cut off here; the loop mirrors the references loop above)
preds = []
with open(target_pred) as pred:
    for line in pred:
        line = line.strip().split()
        line = md.detokenize(line)
        preds.append(line)
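# A minimal, hedged completion of the scoring step the script builds toward,
# using sacreBLEU's standard corpus_bleu API (variable names follow the snippet):
bleu = sacrebleu.corpus_bleu(preds, refs)
print("BLEU:", bleu.score)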