def post_process(pred, amr_version):
    pred = os.path.realpath(pred)
    utils_tar_gz = get_amr_utils(amr_version)
    util_dir = get_resource(utils_tar_gz)
    stog_home = get_resource('https://github.com/jcyk/AMR-gs/archive/master.zip')
    with pushd(stog_home):
        run_cmd(f'python3 -u -m stog.data.dataset_readers.amr_parsing.postprocess.postprocess '
                f'--amr_path {pred} --util_dir {util_dir} --v 2')
    return pred + '.post'

def convert_jsonlines_to_IOBES(json_file, output_file=None, doc_level_offset=True):
    json_file = get_resource(json_file)
    if not output_file:
        output_file = os.path.splitext(json_file)[0] + '.ner.tsv'
    with open(json_file) as src, open(output_file, 'w', encoding='utf-8') as out:
        for line in src:
            doc = json.loads(line)
            offset = 0
            for sent, ner in zip(doc['sentences'], doc['ner']):
                tags = ['O'] * len(sent)
                for start, end, label in ner:
                    if doc_level_offset:
                        # Spans are indexed against the whole document, so shift
                        # them back to sentence-level offsets.
                        start -= offset
                        end -= offset
                    if start == end:
                        tags[start] = 'S-' + label
                    else:
                        tags[start] = 'B-' + label
                        for i in range(start + 1, end + 1):
                            tags[i] = 'I-' + label
                        tags[end] = 'E-' + label
                offset += len(sent)
                for token, tag in zip(sent, tags):
                    out.write(f'{token}\t{tag}\n')
                out.write('\n')

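# A minimal smoke test for the conversion above, kept as a sketch: the sample
# document and file names are hypothetical, and it assumes get_resource passes
# existing local paths through unchanged.
def _demo_convert_jsonlines_to_IOBES():
    import json
    import tempfile
    doc = {'sentences': [['Barack', 'Obama', 'visited', 'Paris']],
           'ner': [[[0, 1, 'PER'], [3, 3, 'LOC']]]}  # spans are inclusive
    src = tempfile.NamedTemporaryFile('w', suffix='.jsonlines', delete=False)
    src.write(json.dumps(doc) + '\n')
    src.close()
    out = src.name + '.ner.tsv'
    convert_jsonlines_to_IOBES(src.name, out)
    # Expected TSV: a single-token span becomes S-*, a longer span B-* ... E-*:
    #   Barack   B-PER
    #   Obama    E-PER
    #   visited  O
    #   Paris    S-LOC
    print(open(out).read())
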
@classmethod
def load_language_model(cls, model_file):
    model_file = get_resource(model_file)
    state = torch.load(model_file)
    model = RNNLanguageModel(state['n_tokens'], state['is_forward_lm'],
                             state['hidden_size'], state['embedding_size'])
    model.load_state_dict(state['state_dict'], strict=False)
    return model

def evaluate(gold_file, pred_file):
    """Evaluate using the official CoNLL-X evaluation script (Yuval Krymolowski).

    Args:
        gold_file(str): The gold conllx file
        pred_file(str): The predicted conllx file

    Returns:
        A tuple of (UAS, LAS).
    """
    gold_file = get_resource(gold_file)
    fixed_pred_file = tempfile.NamedTemporaryFile().name
    copy_cols(gold_file, pred_file, fixed_pred_file, keep_comments=False)
    if gold_file.endswith('.conllu'):
        fixed_gold_file = tempfile.NamedTemporaryFile().name
        copy_cols(gold_file, gold_file, fixed_gold_file, keep_comments=False)
        gold_file = fixed_gold_file
    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl {CONLLX_EVAL} -q -b -g {gold_file} -s {fixed_pred_file}')
    if exitcode:
        raise RuntimeError(f'eval.pl exited with error code {exitcode} '
                           f'and error message {err} and output {out}.')
    lines = out.split('\n')[-4:]
    las = int(lines[0].split()[3]) / int(lines[0].split()[5])
    uas = int(lines[1].split()[3]) / int(lines[1].split()[5])
    return uas, las

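# Hedged usage sketch: the file names below are made up; both files must contain
# token-aligned CoNLL-X annotations of the same sentences.
def _demo_conllx_evaluate():
    uas, las = evaluate('dev.gold.conllx', 'dev.pred.conllx')
    print(f'UAS: {uas:.2%}, LAS: {las:.2%}')
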
def load_config(self, save_dir, filename='config.json', **kwargs):
    save_dir = get_resource(save_dir)
    self.config.load_json(os.path.join(save_dir, filename))
    self.config.update(kwargs)  # overwrite config loaded from disk
    for k, v in self.config.items():
        if isinstance(v, dict) and 'classpath' in v:
            self.config[k] = Configurable.from_config(v)
    self.on_config_ready(**self.config)

def make_gold_conll(ontonotes_path, language):
    ontonotes_path = os.path.abspath(get_resource(ontonotes_path))
    to_conll = get_resource(
        'https://gist.githubusercontent.com/hankcs/46b9137016c769e4b6137104daf43a92/raw/66369de6c24b5ec47696ae307591f0d72c6f3f02/ontonotes_to_conll.sh'
    )
    to_conll = os.path.abspath(to_conll)
    # shutil.rmtree(os.path.join(ontonotes_path, 'conll-2012'), ignore_errors=True)
    with pushd(ontonotes_path):
        try:
            flash(f'Converting [blue]{language}[/blue] to CoNLL format, '
                  f'this might take half an hour [blink][yellow]...[/yellow][/blink]')
            run_cmd(f'bash {to_conll} {ontonotes_path} {language}')
            flash('')
        except RuntimeError as e:
            flash(f'[red]Failed[/red] to convert {language} of {ontonotes_path} to CoNLL. '
                  f'See the exception for details.')
            raise e

def __init__(self, field: str, path: str, trainable=False) -> None:
    super().__init__()
    self.field = field
    path = get_resource(path)
    f = os.path.join(path, 'forward.pt')
    b = os.path.join(path, 'backward.pt')
    self.f: RNNLanguageModel = RNNLanguageModel.load_language_model(f)
    self.b: RNNLanguageModel = RNNLanguageModel.load_language_model(b)
    if not trainable:
        for p in self.parameters():
            p.requires_grad_(False)

def load_data(self, data, generate_idx=False):
    if self.should_load_file(data):
        if isinstance(data, str):
            data = get_resource(data)
        data = list(self.load_file(data))
    if generate_idx:
        for i, each in enumerate(data):
            each[IDX] = i
    # elif isinstance(data, list):
    #     data = self.load_list(data)
    return data

def __init__(self, filepath: str, src, dst=None, **kwargs) -> None:
    if not dst:
        dst = src + '_fasttext'
    self.filepath = filepath
    flash(f'Loading fasttext model {filepath} [blink][yellow]...[/yellow][/blink]')
    filepath = get_resource(filepath)
    with stdout_redirected(to=os.devnull, stdout=sys.stderr):
        self._model = fasttext.load_model(filepath)
    flash('')
    output_dim = self._model['king'].size
    super().__init__(output_dim, src, dst)

def smatch_eval(pred, gold, use_fast=False) -> Union[SmatchScores, F1_]:
    script = get_resource(_FAST_SMATCH_SCRIPT if use_fast else _SMATCH_SCRIPT)
    home = os.path.dirname(script)
    pred = os.path.realpath(pred)
    gold = os.path.realpath(gold)
    with pushd(home):
        flash('Running evaluation script [blink][yellow]...[/yellow][/blink]')
        cmd = f'bash {script} {pred} {gold}'
        text = run_cmd(cmd)
        flash('')
    return format_fast_scores(text) if use_fast else format_official_scores(text)

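# Hedged usage sketch (hypothetical AMR files): with use_fast=False the official
# Smatch script is parsed into a full score breakdown, otherwise the fast
# approximation is used and a single F1 is returned.
def _demo_smatch_eval():
    scores = smatch_eval('dev.pred.amr', 'dev.gold.amr', use_fast=False)
    print(scores)
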
def __init__(self, mapper: Union[str, dict], src: str, dst: str = None) -> None:
    super().__init__(src, dst)
    self.mapper = mapper
    if isinstance(mapper, str):
        mapper = get_resource(mapper)
    if isinstance(mapper, str):
        self._table = load_json(mapper)
    elif isinstance(mapper, dict):
        self._table = mapper
    else:
        raise ValueError(f'Unrecognized mapper type {mapper}')

def read_conll(filepath: Union[str, TimingFileIterator], underline_to_none=False,
               enhanced_collapse_empty_nodes=None):
    sent = []
    if isinstance(filepath, str):
        filepath: str = get_resource(filepath)
        if filepath.endswith('.conllu') and enhanced_collapse_empty_nodes is None:
            # Auto-enable collapsing of empty nodes for CoNLL-U files unless the
            # caller set it explicitly. (The default must be None, not False,
            # for this branch to ever trigger.)
            enhanced_collapse_empty_nodes = True
        src = open(filepath, encoding='utf-8')
    else:
        src = filepath
    for idx, line in enumerate(src):
        if line.startswith('#'):
            continue
        line = line.strip()
        cells = line.split('\t')
        if line and cells:
            if enhanced_collapse_empty_nodes and '.' in cells[0]:
                cells[0] = float(cells[0])
                cells[6] = None
            else:
                if '-' in cells[0] or '.' in cells[0]:
                    # sent[-1][1] += cells[1]
                    continue
                cells[0] = int(cells[0])
                if cells[6] != '_':
                    try:
                        cells[6] = int(cells[6])
                    except ValueError:
                        cells[6] = 0
                        logger.exception(f'Wrong CoNLL format {filepath}:{idx + 1}\n{line}')
            if underline_to_none:
                for i, x in enumerate(cells):
                    if x == '_':
                        cells[i] = None
            sent.append(cells)
        else:
            if enhanced_collapse_empty_nodes:
                sent = collapse_enhanced_empty_nodes(sent)
            yield sent
            sent = []
    if sent:
        if enhanced_collapse_empty_nodes:
            sent = collapse_enhanced_empty_nodes(sent)
        yield sent
    src.close()

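# A minimal reading sketch: each yielded sentence is a list of 10-column rows
# with ID and HEAD already converted to int. The two-token sentence below is
# made up for illustration, and local paths are assumed to pass through
# get_resource unchanged.
def _demo_read_conll():
    import tempfile
    f = tempfile.NamedTemporaryFile('w', suffix='.conllu', delete=False)
    f.write('1\tHello\t_\t_\t_\t_\t2\tdiscourse\t_\t_\n'
            '2\tworld\t_\t_\t_\t_\t0\troot\t_\t_\n\n')
    f.close()
    for sent in read_conll(f.name, enhanced_collapse_empty_nodes=False):
        for cells in sent:
            print(cells[0], cells[1], cells[6], cells[7])  # id, form, head, deprel
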
def make_ontonotes_language_jsonlines(conll12_ontonotes_path, output_path=None, language='english'):
    conll12_ontonotes_path = get_resource(conll12_ontonotes_path)
    if output_path is None:
        output_path = os.path.dirname(conll12_ontonotes_path)
    for split in ['train', 'development', 'test']:
        pattern = f'{conll12_ontonotes_path}/data/{split}/data/{language}/annotations/*/*/*/*gold_conll'
        files = sorted(glob.glob(pattern, recursive=True))
        assert files, f'No gold_conll files found in {pattern}'
        version = os.path.basename(files[0]).split('.')[-1].split('_')[0]
        if version.startswith('v'):
            assert all([version in os.path.basename(f) for f in files])
        else:
            version = 'v5'
        lang_dir = f'{output_path}/{language}'
        if split == 'conll-2012-test':
            split = 'test'
        full_file = f'{lang_dir}/{split}.{language}.{version}_gold_conll'
        os.makedirs(lang_dir, exist_ok=True)
        print(f'Merging {len(files)} files to {full_file}')
        merge_files(files, full_file)
        v5_json_file = full_file.replace(f'.{version}_gold_conll', f'.{version}.jsonlines')
        print(f'Converting CoNLL file {full_file} to json file {v5_json_file}')
        labels, stats = convert_to_jsonlines(full_file, v5_json_file, language)
        print('Labels:')
        pprint(labels)
        print('Statistics:')
        pprint(stats)
        conll12_json_file = f'{lang_dir}/{split}.{language}.conll12.jsonlines'
        print(f'Applying CoNLL 12 official splits on {v5_json_file} to {conll12_json_file}')
        id_file = get_resource(f'http://conll.cemantix.org/2012/download/ids/'
                               f'{language}/coref/{split}.id')
        filter_data(v5_json_file, id_file, conll12_json_file)

def load(self, save_dir: str, devices=None, **kwargs):
    save_dir = get_resource(save_dir)
    # flash('Loading config and vocabs [blink][yellow]...[/yellow][/blink]')
    if devices is None and self.model:
        devices = self.devices
    self.load_config(save_dir, **kwargs)
    self.load_vocabs(save_dir)
    flash('Building model [blink][yellow]...[/yellow][/blink]')
    self.model = self.build_model(**merge_dict(self.config, training=False, **kwargs,
                                               overwrite=True, inplace=True))
    flash('')
    self.load_weights(save_dir, **kwargs)
    self.to(devices)
    self.model.eval()

def load_file(self, filepath):
    filepath = get_resource(filepath)
    # idx = 0
    for words, tags in generate_words_tags_from_tsv(filepath, lower=False):
        # idx += 1
        # if idx % 1000 == 0:
        #     print(f'\rRead instances {idx // 1000}k', end='')
        if self.max_seq_len:
            start = 0
            for short_sents in split_long_sentence_into(words, self.max_seq_len,
                                                        self.sent_delimiter,
                                                        char_level=self.char_level,
                                                        hard_constraint=self.hard_constraint):
                end = start + len(short_sents)
                yield {'token': short_sents, 'tag': tags[start:end]}
                start = end
        else:
            yield {'token': words, 'tag': tags}

def __init__(self, data: str, batch_size, seq_len, tokenizer='char', eos='\n', strip=True,
             vocab=None, cache=False, transform: Union[Callable, List] = None) -> None:
    self.cache = cache
    self.eos = eos
    self.strip = strip
    super().__init__(transform)
    if isinstance(tokenizer, str):
        available_tokenizers = {
            'char': ToChar('text', 'token'),
            'whitespace': WhitespaceTokenizer('text', 'token')
        }
        assert tokenizer in available_tokenizers, \
            f'{tokenizer} not supported, available options: {available_tokenizers.keys()}'
        self.append_transform(available_tokenizers[tokenizer])
    if vocab is None:
        vocab = Vocab()
        self.training = True
    else:
        self.training = vocab.mutable
    self.append_transform(AppendEOS('token', eos=eos))
    self.append_transform(FieldToIndex('token', vocab))
    self.batch_size = batch_size
    data = get_resource(data)
    self.data = data
    self.num_tokens = None
    self.load_file(data)
    self._fp = None
    if isinstance(seq_len, int):
        self.seq_len = lambda: seq_len
    else:
        self.seq_len = seq_len

def official_conll_05_evaluate(pred_path, gold_path):
    script_root = get_resource('http://www.lsi.upc.edu/~srlconll/srlconll-1.1.tgz')
    lib_path = f'{script_root}/lib'
    if lib_path not in os.environ.get('PERL5LIB', ''):
        os.environ['PERL5LIB'] = f'{lib_path}:{os.environ.get("PERL5LIB", "")}'
    bin_path = f'{script_root}/bin'
    if bin_path not in os.environ.get('PATH', ''):
        os.environ['PATH'] = f'{bin_path}:{os.environ.get("PATH", "")}'
    # Run the official script both ways: the same output column then reads as
    # recall (gold vs. pred) and precision (pred vs. gold) respectively.
    eval_info_gold_pred = run_cmd(f'perl {script_root}/bin/srl-eval.pl {gold_path} {pred_path}')
    eval_info_pred_gold = run_cmd(f'perl {script_root}/bin/srl-eval.pl {pred_path} {gold_path}')
    conll_recall = float(eval_info_gold_pred.strip().split('\n')[6].strip().split()[5]) / 100
    conll_precision = float(eval_info_pred_gold.strip().split('\n')[6].strip().split()[5]) / 100
    if conll_recall + conll_precision > 0:
        conll_f1 = 2 * conll_recall * conll_precision / (conll_recall + conll_precision)
    else:
        conll_f1 = 0
    return conll_precision, conll_recall, conll_f1

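# Hedged usage sketch (hypothetical props files): requires a working perl plus
# the srlconll-1.1 scripts fetched above.
def _demo_official_conll_05_evaluate():
    p, r, f1 = official_conll_05_evaluate('dev.pred.props', 'dev.gold.props')
    print(f'P: {p:.2%}, R: {r:.2%}, F1: {f1:.2%}')
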
def __init__(self,
             trn: str = None,
             dev: str = None,
             tst: str = None,
             sampler_builder: SamplerBuilder = None,
             dependencies: str = None,
             scalar_mix: ScalarMixWithDropoutBuilder = None,
             use_raw_hidden_states=False,
             lr=1e-3,
             separate_optimizer=False,
             cls_is_bos=True,
             sep_is_eos=False,
             char2concept_dim=128,
             cnn_filters=((3, 256),),
             concept_char_dim=32,
             concept_dim=300,
             dropout=0.2,
             embed_dim=512,
             eval_every=20,
             ff_embed_dim=1024,
             graph_layers=2,
             inference_layers=4,
             num_heads=8,
             rel_dim=100,
             snt_layers=4,
             unk_rate=0.33,
             vocab_min_freq=5,
             beam_size=8,
             alpha=0.6,
             max_time_step=100,
             amr_version='2.0',
             **kwargs) -> None:
    super().__init__(**merge_locals_kwargs(locals(), kwargs))
    self.vocabs = VocabDict()
    utils_dir = get_resource(get_amr_utils(amr_version))
    self.sense_restore = NodeRestore(NodeUtilities.from_json(utils_dir))

def on_config_ready(self, **kwargs):
    super().on_config_ready(**kwargs)
    utils_dir = get_resource(get_amr_utils(self.config.amr_version))
    self.sense_restore = NodeRestore(NodeUtilities.from_json(utils_dir))

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ========================================================================
# -*- coding:utf-8 -*-
# Author: hankcs
import tempfile

from elit.components.parsers.conll import read_conll
from elit.utils.io_util import get_resource, get_exitcode_stdout_stderr

CONLLX_EVAL = get_resource('https://github.com/elikip/bist-parser/archive/master.zip' +
                           '#bmstparser/src/utils/eval.pl')

def load_weights(self, save_dir, filename='model.pt', **kwargs):
    save_dir = get_resource(save_dir)
    filename = os.path.join(save_dir, filename)
    # flash(f'Loading model: {filename} [blink]...[/blink][/yellow]')
    self.model_.load_state_dict(torch.load(filename, map_location='cpu'), strict=False)

def evaluate(self, tst_data, save_dir=None, logger: logging.Logger = None, batch_size=None,
             output=False, **kwargs):
    if not self.model:
        raise RuntimeError('Call fit or load before evaluate.')
    if isinstance(tst_data, str):
        tst_data = get_resource(tst_data)
        filename = os.path.basename(tst_data)
    else:
        filename = None
    if output is True:
        output = self.generate_prediction_filename(
            tst_data if isinstance(tst_data, str) else 'test.txt', save_dir)
    if logger is None:
        _logger_name = basename_no_ext(filename) if filename else None
        logger = self.build_logger(_logger_name, save_dir)
    if not batch_size:
        batch_size = self.config.get('batch_size', 32)
    data = self.build_dataloader(**merge_dict(self.config, data=tst_data,
                                              batch_size=batch_size, shuffle=False,
                                              device=self.devices[0], logger=logger,
                                              overwrite=True))
    dataset = data
    while dataset and hasattr(dataset, 'dataset'):
        dataset = dataset.dataset
    num_samples = len(dataset) if dataset else None
    if output and isinstance(dataset, TransformDataset):
        def add_idx(samples):
            for idx, sample in enumerate(samples):
                if sample:
                    sample[IDX] = idx

        add_idx(dataset.data)
        if dataset.cache:
            add_idx(dataset.cache)
    criterion = self.build_criterion(**self.config)
    metric = self.build_metric(**self.config)
    start = time.time()
    outputs = self.evaluate_dataloader(data, criterion=criterion, filename=filename,
                                       output=output, input=tst_data, save_dir=save_dir,
                                       test=True, num_samples=num_samples,
                                       **merge_dict(self.config, batch_size=batch_size,
                                                    metric=metric, logger=logger, **kwargs))
    elapsed = time.time() - start
    if logger:
        if num_samples:
            logger.info(f'speed: {num_samples / elapsed:.0f} samples/second')
        else:
            logger.info(f'speed: {len(data) / elapsed:.0f} batches/second')
    return metric, outputs

def transform(self, **kwargs) -> Callable:
    vocab = Vocab()
    vocab.load(os.path.join(get_resource(self.path), 'vocab.json'))
    return TransformList(ContextualStringEmbeddingTransform(self.field),
                         FieldToIndex(f'{self.field}_f_char', vocab),
                         FieldToIndex(f'{self.field}_b_char', vocab))

def load_vocab(self, save_dir, filename='vocab.json'):
    save_dir = get_resource(save_dir)
    vocab = SerializableDict()
    vocab.load_json(os.path.join(save_dir, filename))
    self.vocab.copy_from(vocab)

def load_vocabs(self, save_dir, filename='vocabs.json', vocab_cls=Vocab):
    save_dir = get_resource(save_dir)
    vocabs = SerializableDict()
    vocabs.load_json(os.path.join(save_dir, filename))
    self._load_vocabs(self, vocabs, vocab_cls)

def load_from_meta_file(save_dir: str, meta_filename='meta.json', transform_only=False,
                        load_kwargs=None, **kwargs) -> Component:
    """Load a component from a save_dir or an identifier resolvable by ``get_resource``.

    Args:
        save_dir: The path or identifier of the saved component.
        meta_filename (str): The meta file of that saved component, which stores the
            classpath and version.
        transform_only: Load only the transform, not the model weights.
        load_kwargs: Extra arguments passed to ``load``.
        **kwargs: Extra arguments passed to ``load``.

    Returns:
        The loaded component.
    """
    identifier = save_dir
    load_path = save_dir
    save_dir = get_resource(save_dir)
    if save_dir.endswith('.json'):
        meta_filename = os.path.basename(save_dir)
        save_dir = os.path.dirname(save_dir)
    metapath = os.path.join(save_dir, meta_filename)
    if not os.path.isfile(metapath):
        metapath = os.path.join(save_dir, 'config.json')
    if not os.path.isfile(metapath):
        tips = ''
        if save_dir.isupper():
            from difflib import SequenceMatcher
            similar_keys = sorted(pretrained.ALL.keys(),
                                  key=lambda k: SequenceMatcher(None, save_dir, k).ratio(),
                                  reverse=True)[:5]
            tips = f'Check its spelling based on the available keys:\n' + \
                   f'{sorted(pretrained.ALL.keys())}\n' + \
                   f'Tips: it might be one of {similar_keys}'
        raise FileNotFoundError(
            f'The identifier {save_dir} resolves to a non-existent meta file {metapath}. {tips}')
    meta: dict = load_json(metapath)
    cls = meta.get('classpath', None)
    if not cls:
        cls = meta.get('class_path', None)  # For older versions
    assert cls, f'{meta_filename} doesn\'t contain a classpath field'
    try:
        obj: Component = object_from_classpath(cls)
        if hasattr(obj, 'load'):
            if transform_only:
                # noinspection PyUnresolvedReferences
                obj.load_transform(save_dir)
            else:
                if load_kwargs is None:
                    load_kwargs = {}
                if os.path.isfile(os.path.join(save_dir, 'config.json')):
                    obj.load(save_dir, **kwargs)
                else:
                    obj.load(metapath, **kwargs)
            obj.config['load_path'] = load_path
        return obj
    except Exception:
        eprint(f'Failed to load {identifier}. See traceback below:')
        eprint(f'{"ERROR LOG BEGINS":=^80}')
        traceback.print_exc()
        eprint(f'{"ERROR LOG ENDS":=^80}')
        from pkg_resources import parse_version
        model_version = meta.get('elit_version', 'unknown')
        if model_version == '2.0.0':  # Quick fix: the first version used a wrong string
            model_version = '2.0.0-alpha.0'
        model_version = parse_version(model_version)
        installed_version = parse_version(version.__version__)
        try:
            latest_version = get_latest_info_from_pypi()
        except Exception:
            latest_version = None
        if model_version > installed_version:
            eprint(f'{identifier} was created with elit-{model_version}, '
                   f'while you are running a lower version: {installed_version}. ')
        if installed_version != latest_version:
            eprint(f'Please upgrade elit with:\n'
                   f'pip install --upgrade elit\n')
        eprint('If the problem still persists, please submit an issue to '
               'https://github.com/emorynlp/elit/issues\n'
               'When reporting an issue, make sure to paste the FULL ERROR LOG above.')
        exit(1)

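# Hedged usage sketch: the identifier below is hypothetical; any local save_dir,
# path to a meta/config json, or key resolvable by get_resource works.
def _demo_load_from_meta_file():
    component = load_from_meta_file('/path/to/saved_component')
    print(component.config.get('classpath'))
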
# -*- coding:utf-8 -*-
# Author: hankcs
import os
import tempfile
from typing import List

from elit.metrics.parsing.conllx_eval import copy_cols
from elit.common.structure import SerializableDict
from elit.metrics.parsing import iwpt20_xud_eval
from elit.metrics.parsing.iwpt20_xud_eval import load_conllu_file
from elit.utils.io_util import get_resource, get_exitcode_stdout_stderr

UD_TOOLS_ROOT = get_resource(
    'https://github.com/UniversalDependencies/tools/archive/1650bd354bd158c75836cff6650ea35cc9928fc8.zip'
)
ENHANCED_COLLAPSE_EMPTY_NODES = os.path.join(UD_TOOLS_ROOT, 'enhanced_collapse_empty_nodes.pl')
CONLLU_QUICK_FIX = os.path.join(UD_TOOLS_ROOT, 'conllu-quick-fix.pl')


def run_perl(script, src, dst=None):
    if not dst:
        dst = tempfile.NamedTemporaryFile().name
    exitcode, out, err = get_exitcode_stdout_stderr(
        f'perl -I{os.path.expanduser("~/.local/lib/perl5")} {script} {src}')
    if exitcode:
        # The required perl modules can be installed locally with:
        # cpanm -l ~/.local namespace::autoclean
        # cpanm -l ~/.local Moose
        raise RuntimeError(err + out)
    with open(dst, 'w') as ofile:
        ofile.write(out)
    return dst

def __init__(self):
    """
    :class:`EnglishTokenizer` splits the input text into linguistic tokens.
    """
    super(EnglishTokenizer, self).__init__()

    # _inflection_lexicons
    resource_root = get_resource(ELIT_URL + 'tokenizer.zip')
    self.ABBREVIATION_PERIOD = read_word_set(
        os.path.join(resource_root, 'english_abbreviation_period.txt'))
    self.APOSTROPHE_FRONT = read_word_set(
        os.path.join(resource_root, 'english_apostrophe_front.txt'))
    self.MAP_CONCAT_WORD = read_concat_word_dict(
        os.path.join(resource_root, 'english_concat_words.txt'))
    self.HYPHEN_PREFIX = read_word_set(
        os.path.join(resource_root, 'english_hyphen_prefix.txt'))
    self.HYPHEN_SUFFIX = read_word_set(
        os.path.join(resource_root, 'english_hyphen_suffix.txt'))

    # regular expressions
    self.RE_NETWORK_PROTOCOL = re.compile(
        r'((http|https|ftp|sftp|ssh|ssl|telnet|smtp|pop3|imap|imap4|sip)(://))')
    """
    :abc: <3 </3 <\3 (: ): \\: *: $: (-: (^: (= (; :) :( =) B) 8) :-) :^) :3 :D :p :| :(( :---)
    """
    self.RE_EMOTICON = re.compile(
        r'(:\w+:|<[\\/]?3|[()\\|*$][-^]?[:=;]|[:=;B8]([-^]+)?[3DOPp@$*()\\/|]+)(\W|$)')
    """
    [email protected] [email protected] [email protected] jinho:[email protected]
    """
    self.RE_EMAIL = re.compile(
        r'[\w\-.]+(:\S+)?@(([A-Za-z0-9\-]+\.)+[A-Za-z]{2,12}|\d{1,3}(\.\d{1,3}){3})')
    """
    &arrow; &#123; &#x123; &#X123;
    """
    self.RE_HTML_ENTITY = re.compile(r'&([A-Za-z]+|#[Xx]?\d+);')
    """
    [1] (1a) {A} <a1> [***] [A.a] [A.1] [1.a] ((---))
    """
    self.RE_LIST_ITEM = re.compile(
        r'(([\[({<]+)(\d+[A-Za-z]?|[A-Za-z]\d*|\W+)(\.(\d+|[A-Za-z]))*([\])\}>])+)')
    """
    don't don’t I'll HE'S
    """
    self.RE_APOSTROPHE = re.compile(
        r'(?i)[a-z](n[\'\u2019]t|[\'\u2019](ll|nt|re|ve|[dmstz]))(\W|$)')
    """
    a.b.c 1-2-3
    """
    self.RE_ABBREVIATION = re.compile(r'[A-Za-z0-9]([.-][A-Za-z0-9])*$')
    """
    10kg 1cm
    """
    self.RE_UNIT = re.compile(
        r'(?i)(\d)([acdfkmnpyz]?[mg]|[ap]\.m|ch|cwt|d|drc|ft|fur|gr|h|in|lb|lea|mi|ms|oz|pg|qtr|yd)$')
    """
    hello.World
    """
    self.RE_FINAL_MARK_IN_BETWEEN = re.compile(
        r'([A-Za-z]{3,})([.?!]+)([A-Za-z]{3,})$')

def make_ner_tsv_if_necessary(json_file):
    json_file = get_resource(json_file)
    output_file = os.path.splitext(json_file)[0] + '.ner.tsv'
    if not os.path.isfile(output_file):
        convert_jsonlines_to_IOBES(json_file, output_file)
    return output_file