def extract(self, sentences):
    pdesc = parserdesc(self.lang)
    process, keep = pdesc.config
    # Patch: the RASP POS tag lives under the 'rpos' key
    key = Struct(form='form',
                 pos=('rpos' if pdesc.name == 'rasp' else 'pos'),
                 lem='lem')
    recs = []
    for _, deps in JsonDepBuilder(key=key).build(sentences, keep=keep):
        recs.extend([(sid, process(dep)) for sid, dep in deps])
    return recs
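# Shape note (inferred from the consumer in m4detect below; not an
# authoritative spec): each record is a (sentence_id, dep) pair whose dep
# part unpacks as (n_idx, v_idx, _, n_form, n_lemma, v_form, v_lemma).
# Example with made-up values:
#
#     (4, ('2', '1', 'subj', 'prices', 'price', 'soared', 'soar'))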
def parser_for(lang):
    """Create a Parser object for a specific language.

    :param lang: one of 'en', 'es', 'ru', 'fa'.
    :returns: a Parser object.
    """
    pdesc = parserdesc(lang)
    return Parser(name=pdesc.name,
                  config=pdesc.config,
                  command=pdesc.command,
                  debug=True,
                  encoding='utf-8')
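# Note (inferred from usage in this module, not an authoritative spec):
# parserdesc(lang) returns a descriptor with at least these attributes:
#
#     .name       parser id, e.g. 'rasp'
#     .config     a (process, keep) pair (see extract() above)
#     .command    the external parser command
#     .tokenizer  a tokenizer callable (used by m4detect below)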
def main():
    """Runs LM to concept mapping."""
    global REMAP_CONCEPT

    # ------------------------------------------------------------------- #
    # INITIALIZATION
    m4test = ix.IARPATestCommand('metam',
                                 'Map LMs to target and source concepts.')
    # add some custom cmdline parameters
    aparser = m4test.getArgParser()
    cmdline, config = m4test.parseCmdLineConfig('m4mapping')
    in_jdata = m4test.getJSON()

    # ------------------------------------------------------------------- #
    # MAIN APPLICATION LOGIC
    lang = in_jdata['lang']
    mappingsystems = config.getList('mappingsystems', lang=lang)
    if not mappingsystems:
        mappingsystems = ['CNMS', 'DSMS', 'DLS']
    secondaryMappingThreshold = config.getFloat('secondarymappingthreshold',
                                                lang=lang, default=0.1)
    secondaryMinScore = config.getFloatFromComp('cnms', 'secondaryminscore',
                                                lang=lang, default=0.1)
    mappingLimit = config.getIntFromComp('cnms', 'sourcelimit',
                                         lang=lang, default=2)
    if secondaryMappingThreshold:
        m4test.setSecondaryMappingThreshold(secondaryMappingThreshold)
    conceptrank = config.getListFromComp('cnms', 'targetconceptranking',
                                         lang=lang)
    expansionTypes = config.getListFromComp('cnms', 'expansiontypes',
                                            lang=lang)
    expansionScoreScale = config.getFloatFromComp('cnms', 'expansionscorescale',
                                                  lang=lang, default=1.0)
    dsmsdefaultrank = config.getIntFromComp('dsms', 'defaultrank',
                                            lang=lang, default=2)
    dsmsdefaultscore = config.getFloatFromComp('dsms', 'defaultscore',
                                               lang=lang, default=0.10)
    dsmsScoreStr = ':%s:%s' % (dsmsdefaultrank, dsmsdefaultscore)

    # initialize the CNMS system: it is always used, at least for target
    # concept lookups
    cnmap = ConceptualNetworkMapper(in_jdata['lang'], cmdline.cachedir,
                                    useSE=cmdline.useSE, govOnly=True,
                                    disableFN=True,
                                    targetConceptRank=conceptrank,
                                    expansionTypes=expansionTypes,
                                    expansionScoreScale=expansionScoreScale)

    # ------------------------------------------------------------------- #
    # Invoke the parser here and add tags to the 'sentences' element of the
    # JSON input
    in_sentences = in_jdata['sentences']

    # run the POS tagger/lemmatizer for all languages except Persian (CNMS)
    if lang != 'fa':
        tt = mnjson.MNTreeTagger(lang)
        tt.cleanText(in_sentences)
        tt.run(in_sentences)
        tt.processLMs(in_sentences)

    # run the dependency parser for English, Russian, and Spanish (DSMS)
    if (lang in ('en', 'ru', 'es')) and ('DSMS' in mappingsystems):
        ss = [s['ctext'] for s in in_sentences]
        logger.info('begin parsing sentence block, lang: %s, len: %d'
                    % (lang, len(ss)))
        out_jdata = parse(in_jdata['lang'], ss)
        logger.info('end parsing sentence block')
        mapping = Assigner(lang)
    else:
        out_jdata = in_jdata

    currentTestItem = ''
    parser_name = parserdesc(lang).name
    # XXX makes no sense!
    # for in_sent, parsed_sent, in_sent in zip(in_sentences, out_jdata['sentences'], in_jdata['sentences']):
    for in_sent, parsed_sent in zip(in_sentences, out_jdata['sentences']):
        testItemId = in_sent['id'].split(u':')[1]
        if testItemId != currentTestItem:
            currentTestItem = testItemId
            logger.warn('mapping sentences in %s', currentTestItem)
        if 'lms' not in in_sent:
            continue
        for lm in in_sent['lms']:
            source, target = lm['source'], lm['target']
            # ===============================================================
            # TARGET CONCEPT MAPPING: ALWAYS USE CNMS
            # ===============================================================
            cnmap.runTargetMapping(lm)
            lm['extractor'] = 'CNMS'
            # remap target concepts if needed: this is a hack to deal with
            # IARPA's inconsistency about concept coverage
            if target.get('concept') in REMAP_CONCEPT:
                target['concept'] = REMAP_CONCEPT[target['concept']]

            # ================================================================
            # CNMS
            # ================================================================
            if 'CNMS' in mappingsystems:
                cnmap.runSourceMapping(lm, sourceMappingLimit=mappingLimit,
                                       minSecondaryScore=secondaryMinScore)

            # ================================================================
            # DSMS MAPPING SYSTEM (formerly KMS)
            # ================================================================
            if ((source.get('concept') in (None, 'NULL', 'NONE', '')) and
                    ('DSMS' in mappingsystems) and
                    (lang in ('en', 'ru', 'es'))):
                target_f = target['form'] if 'form' in target else target['lemma']
                source_f = source['form'] if 'form' in source else source['lemma']
                found_lms = False
                words = sorted(parsed_sent['word'], key=lambda w: w['idx'])
                twords = sorted(in_sent['word'], key=lambda w: w['idx'])
                # logger.info(pformat(in_sent['word']))

                # try looking for a relation first
                relations = parsed_sent[parser_name]['relations']
                found_lms = find_lm5(target_f, source_f, relations)
                if not found_lms:
                    found_lms = find_lm3(target_f, source_f, twords)
                # if not found_lms:
                #     found_lms = find_lm4(target, source, words)
                logger.debug('DSMS: found_lms: %s' % found_lms)

                if found_lms:
                    target_l, source_l, _r = found_lms[0]
                    target['rlemma'] = target_l
                    source['rlemma'] = source_l
                    if _r != '-':
                        r = _r.split('.')[0] if '.' in _r else _r
                        dimensions = mapping.assign2(source_l, r)
                    else:
                        dimensions = mapping.gassign(source_l, target_l)
                    scon = dimensions[0].upper() if dimensions else None
                else:
                    scon = None
                    target_l = target['lemma'] if 'lemma' in target else target['form']
                    source_l = source['lemma'] if 'lemma' in source else source['form']
                    # dd = ', '.join(' '.join(d) for d in deprels(words))
                    # log('could not find %s - %s in %s' % (target_f, source_f, dd))

                source['concept'] = (scon + dsmsScoreStr) if scon else 'NONE'
                if scon:
                    if source.get('extractor'):
                        source['extractor'] += ':DSMS'
                    else:
                        source['extractor'] = 'DSMS'

    # ------------------------------------------------------------------- #
    # OUTPUT FILE GENERATION
    m4test.writeOutput(in_jdata)
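# Illustration (concept name made up; the format comes from dsmsScoreStr
# above): with the default rank 2 and score 0.10, a DSMS-mapped source
# concept is serialized as
#
#     source['concept'] == 'DISEASE:2:0.1'
#
# while unmapped sources get the literal string 'NONE'.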
class Partial(object):
    # NB: the class header is reconstructed; only the method bodies appear
    # in the source, and the name 'Partial' is taken from the call site below.
    def __init__(self, f, **entries):
        update(self, f=f, **entries)

    def __call__(self, fn):
        self.f(fn, **vars(self))


if __name__ == '__main__':
    args = argparser().parse_args()

    def noext(fn):
        fn, _ = splitext(fn)
        return fn

    logging.basicConfig(level=logging.INFO)
    fnames = set(map(noext, flattened(iglob(a) for a in args.f)))
    desc = parserdesc(args.lang)
    config = desc.config
    # txtor = make_translator(desc.name, config[0], config[1])
    txtor_args = (desc.name, config[0], config[1])
    # writer = make_writer(args.dbtype, 'test', args.clear, args.lang)
    writer_args = (args.dbtype, 'test', args.clear, args.lang)
    # process_fn = partial(process, writer_args=writer_args, txtor=txtor, chunk=args.chunk)
    process_fn = Partial(process, writer_args=writer_args,
                         txtor_args=txtor_args, chunk=args.chunk)

    msg = """\
Running parameters:
--------------------------------
Language:        {lang}
Database:        {dbtype}
Clear database:  {clear}
File count:      {file_count}
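# Design note (inferred, not stated in the source): unlike functools.partial
# in Python 2, Partial keeps its bound arguments as plain instance
# attributes, which leaves the callable picklable; the commented-out
# partial(...) line above suggests it replaced functools.partial for that
# reason. Hypothetical usage:
#
#     # pool = multiprocessing.Pool()
#     # pool.map(process_fn, sorted(fnames))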
def m4detect(lang, json_in, seed_fn, invoke_parser=False, extend_seeds=False,
             **kw):
    """Metaphor detection using the seed system.

    :param lang: language (one of 'en', 'es', 'ru', 'fa')
    :param json_in: the JSON document object (a dict) containing at least a
        'sentences' key
    :param seed_fn: a list of seeds
    :param invoke_parser: invoke the parser on the sentences in the JSON doc
    :param extend_seeds: whether or not to try to extend seeds (English only)
    :returns: json_in with a list of the found LMs appended to each sentence
    """
    relations, json_out = extract(json_in, lang, invoke_parser)

    def counted(relation):
        # XXX 'dependencies' is not defined in this scope
        return Counter((noun, verb)
                       for rel, noun, verb in dependencies if rel == relation)

    tokenizer = parserdesc(lang).tokenizer

    def lm(sentence, relation, seed, noun_l, verb_l):
        """Outputs an LM with all the required keys."""
        def offset(lemma, idx):
            "Finds the offset of <lemma> in sentence."
            words = tokenizer(sentence)
            try:
                w = words[idx]
                word = w[0] if len(w) == 2 else w
                start = sentence.find(word)
                return dict(start=start, end=start + len(word))
            except IndexError:
                dprint(u'Problem finding offset of', lemma, 'at', idx, 'in:')
                dpprint((idx, words))
                return dict(start=-1, end=-1)

        def dom(word, rel):
            return dict(offset(word.form, word.idx),
                        lpos=u'{0[0]}.{0[1]}'.format(word.lemma),
                        lemma=word.lemma[0],
                        form=word.form[0],
                        rel=rel)

        n_rel, v_rel = relation.split('-')
        noun, verb = rels[noun_l, verb_l]
        dprint('lm:', '\n  noun', noun, '\n  verb', verb)
        return dict(name=u'{0[0]} {1[0]}'.format(noun.lemma, verb.lemma),
                    target=dom(noun, n_rel),
                    source=dom(verb, v_rel),
                    seed=u' '.join(u'%s.%s' % s for s in seed))

    # TODO: optimization: this should be created once at the beginning.
    # Perhaps on import?
    mfinder = MetaphorFinderEx(lang, seed_fn, extend_seeds)
    # TODO: this is inefficient: Python will evaluate the arguments anyway
    # dprint('All possible metaphors:')
    # dforeach(partial(dump, indent=1), sorted(mfinder.mbuilder.metaphors))

    # relations grouped by sentence id
    depsbysent = groupby(relations, key=lambda (sent_id, _): sent_id)
    sentences = json_out['sentences']
    # sentence ids are 1-based; convert to 0-based list indices
    for i, deps in ((i - 1, list(deps)) for i, deps in depsbysent):
        # index deps by <noun-lemma, verb-lemma> pairs
        rels = dict(((n_l, v_l), (Struct(lemma=n_l, form=n_f, idx=int(n_idx)),
                                  Struct(lemma=v_l, form=v_f, idx=int(v_idx))))
                    for _, (n_idx, v_idx, _, n_f, n_l, v_f, v_l) in deps)
        mets = mfinder.find(rels.keys())
        sent = sentences[i]
        dprint('_' * 96, '\n', sent['text'])
        dforeach(partial(dump, indent=1), rels.keys())
        lms = [lm(sent['text'], rel, seed, noun_l, verb_l)
               for (rel, seed, (noun_l, verb_l)) in mets]
        dprint('found LMs:', pformat(lms))
        sent['lms'] = lms

    jsonout = dict((k, v) for k, v in json_in.items() if k != 'sentences')
    jsonout['sentences'] = sentences
    return jsonout
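# Usage sketch (hedged: the input file name and seed list are assumptions;
# the call signature is the one defined above):
#
#     import json
#     with open('input.json') as f:        # hypothetical input document
#         json_in = json.load(f)
#     json_out = m4detect('en', json_in, seeds, invoke_parser=True)
#     for sent in json_out['sentences']:
#         print sent.get('lms', [])        # LMs found in each sentence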