def main(): """ Runs affect computation. """ # ------------------------------------------------------------------- # # INITIALIZATION m4test = ix.IARPATestCommand( 'metaa', 'Computes LM affect in terms of polarity and intensity.') cmdline = m4test.parseCmdLine() jdata = m4test.getJSON() # ------------------------------------------------------------------- # # MAIN APPLICATION LOGIC lang = jdata['lang'] if lang != 'fa': tt = mnjson.MNTreeTagger(lang) tt.cleanText(jdata['sentences']) tt.run(jdata['sentences']) tt.processLMs(jdata['sentences']) esfilterwords = set([ 'a', 'desde', 'detrás', 'ante', 'en', 'segun', 'bajo', 'entre', 'sin', 'con', 'hacia', 'sobre', 'contra', 'hasta', 'la', 'el', 'los', 'tras', 'de', 'por', 'para' ]) aff_system = AffectLookup(jdata['lang'], cmdline.extdir) for sent in jdata['sentences']: for lm in sent['lms']: tg = lm['target'] sc = lm['source'] tlemma = tg['lemma'] if 'lemma' in tg else tg['form'] slemma = sc['lemma'] if 'lemma' in sc else sc['form'] affect = aff_system.getLMAffect(tlemma.lower(), slemma.lower()) if affect == 999: if lang == 'es': if ' ' in tlemma: tlist = [] for w in tlemma.split(): if w in esfilterwords: continue tlist.append(w) tlemma = u' '.join(tlist) if ' ' in slemma: slist = [] for w in slemma.split(): if w in esfilterwords: continue slist.append(w) slemma = u' '.join(slist) affect = aff_system.getLMAffect(tlemma.lower(), slemma.lower()) lm['affect'] = affect # ------------------------------------------------------------------- # # OUTPUT FILE GENERATION m4test.writeOutput(jdata)
def computePOS(lang, sentences):
    """ Compute POS tags and add them under a 'word' node in each sentence.
    The 'word' node is a list of dicts, each of which describes one word in
    the sentence. Uses TreeTagger for EN, ES, and RU, and a custom HMM
    tagger for FA.
    :param lang: language code ('en', 'es', 'ru', or 'fa')
    :type lang: str
    :param sentences: list of sentence dicts
    :type sentences: list
    """
    if lang == 'fa':
        pt = PersianPOSTagger()
        for sent in sentences:
            sent['ctext'] = pt.cleanText(sent['text'])
            tags = pt.run_hmm_tagger(sent['ctext'])
            sent['word'] = pt.getWordList(sent['text'], sent['ctext'],
                                          tags, 'pos', 'lem')
    else:
        tt = mnjson.MNTreeTagger(lang)
        tt.cleanText(sentences)
        tt.run(sentences)
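
# Example of the per-sentence structure computePOS is expected to produce
# (a sketch: the key names 'idx', 'pos', and 'lem' follow how the rest of
# this module reads the 'word' node; the tag values are invented):
#
#   sent = {'text': u'The state is drowning in debt.'}
#   computePOS('en', [sent])
#   # sent['word'] ~ [{'idx': 0, 'form': u'The', 'pos': 'DT', 'lem': u'the'},
#   #                 {'idx': 1, 'form': u'state', 'pos': 'NN', 'lem': u'state'},
#   #                 ...]
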
def __init__(self, exdir=None, wldir=None, cxndir=None, verbose=False):
    global TAGGER_LANGNAME, DOMAIN
    # NB: assumes that either MNEXTRACTPATH is set in the environment or
    # exdir is passed in; otherwise self.exdir remains unset
    if 'MNEXTRACTPATH' in os.environ:
        self.exdir = os.environ['MNEXTRACTPATH']
    if exdir:
        self.exdir = exdir
    self.wldir = self.exdir + '/wordlists'
    self.cxndir = self.exdir + '/cxns'
    if wldir:
        self.wldir = wldir
    if cxndir:
        self.cxndir = cxndir
    self.verbose = verbose
    # set up taggers for all supported languages and pre-load all
    # wordlists and cxn lists
    for l in TAGGER_LANGNAME:
        if TAGGER_LANGNAME[l]:
            self.taggers[l] = mnjson.MNTreeTagger(l)
        else:
            self.taggers[l] = None
        langwldir = self.wldir + '/' + l + '/'
        langcxndir = self.cxndir + '/' + l + '/'
        tfile = langwldir + "target." + DOMAIN
        sfile = langwldir + "source." + DOMAIN
        cfile = langcxndir + "cxns." + DOMAIN
        if os.path.exists(tfile) and os.path.exists(sfile):
            (dtwlist, dtwlists,
             dswlists) = self.get_domained_wordlists(tfile, sfile)
            self.twlist_by_lang[l] = dtwlist
            self.twlists_by_lang[l] = dtwlists
            self.swlists_by_lang[l] = dswlists
            self.old_twlist_by_lang[l] = self.get_wordlist(tfile)
            self.old_swlist_by_lang[l] = self.get_wordlist(sfile)
            self.tword_rank_by_lang[l] = self.get_ranked_wordlist(tfile)
        if os.path.exists(cfile):
            (dcxns, dcxn_ranking) = self.get_cxns(cfile)
            self.cxns_by_lang[l] = dcxns
            self.cxn_ranks_by_lang[l] = dcxn_ranking
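
# The resource files probed above are expected to follow a per-language
# layout along these lines (a sketch: DOMAIN is a module-level constant
# defined elsewhere; the placeholder names are illustrative):
#
#   <exdir>/wordlists/<lang>/target.<DOMAIN>   # target-domain wordlist
#   <exdir>/wordlists/<lang>/source.<DOMAIN>   # source-domain wordlist
#   <exdir>/cxns/<lang>/cxns.<DOMAIN>          # construction (cxn) list
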
def main(): """ Runs LM to concept mapping. """ global REMAP_CONCEPT # ------------------------------------------------------------------- # # INITIALIZATION m4test = ix.IARPATestCommand('metam', 'Map LMs to target and source concepts.') # add some custom cmdline parameters aparser = m4test.getArgParser() cmdline, config = m4test.parseCmdLineConfig('m4mapping') in_jdata = m4test.getJSON() # ------------------------------------------------------------------- # # MAIN APPLICATION LOGIC lang = in_jdata['lang'] mappingsystems = config.getList('mappingsystems', lang=lang) if not mappingsystems: mappingsystems = ['CNMS', 'DSMS', 'DLS'] secondaryMappingThreshold = config.getFloat('secondarymappingthreshold', lang=lang, default=0.1) secondaryMinScore = config.getFloatFromComp('cnms', 'secondaryminscore', lang=lang, default=0.1) mappingLimit = config.getIntFromComp('cnms', 'sourcelimit', lang=lang, default=2) if secondaryMappingThreshold: m4test.setSecondaryMappingThreshold(secondaryMappingThreshold) conceptrank = config.getListFromComp('cnms', 'targetconceptranking', lang=lang) expansionTypes = config.getListFromComp('cnms', 'expansiontypes', lang=lang) expansionScoreScale = config.getFloatFromComp('cnms', 'expansionscorescale', lang=lang, default=1.0) dsmsdefaultrank = config.getIntFromComp('dsms', 'defaultrank', lang=lang, default=2) dsmsdefaultscore = config.getFloatFromComp('dsms', 'defaultscore', lang=lang, default=0.10) dsmsScoreStr = ':%s:%s' % (dsmsdefaultrank, dsmsdefaultscore) # initialize CNMS system # this is always used at least for target concept lookups cnmap = ConceptualNetworkMapper(in_jdata['lang'], cmdline.cachedir, useSE=cmdline.useSE, govOnly=True, disableFN=True, targetConceptRank=conceptrank, expansionTypes=expansionTypes, expansionScoreScale=expansionScoreScale) # ------------------------------------------------------------------- # # Invoke here the parser and add tags to the sentences element of the JSON input in_sentences = in_jdata['sentences'] # run POS/Lemmatizer for all languages except Persian (CNMS) if (lang != 'fa'): tt = mnjson.MNTreeTagger(lang) tt.cleanText(in_sentences) tt.run(in_sentences) tt.processLMs(in_sentences) # run dependency parser for Englishjunk if (lang in ('en', 'ru', 'es')) and ('DSMS' in mappingsystems): ss = [s['ctext'] for s in in_sentences] logger.info('begin parsing sentence block, lang: %s, len: %d' % (lang, len(ss))) out_jdata = parse(in_jdata['lang'], ss) logger.info('end parsing sentence block') mapping = Assigner(lang) else: out_jdata = in_jdata currentTestItem = '' parser_name = parserdesc(lang).name # XXX makes no sense! # for in_sent, parsed_sent, in_sent in zip(in_sentences, out_jdata['sentences'], in_jdata['sentences']): for in_sent, parsed_sent in zip(in_sentences, out_jdata['sentences']): testItemId = in_sent['id'].split(u':')[1] if testItemId != currentTestItem: currentTestItem = testItemId logger.warn('mapping sentences in %s', currentTestItem) if 'lms' not in in_sent: continue for lm in in_sent['lms']: source, target = lm['source'], lm['target'] # =============================================================== # TARGET CONCEPT MAPPING: ALWAYS USE CNMS # =============================================================== cnmap.runTargetMapping(lm) lm['extractor'] = 'CNMS' # remap targetconcepts if needed. 
this is a hack to deal with # IARPA's inconsistency about concept coverage if target.get('concept') in REMAP_CONCEPT: target['concept'] = REMAP_CONCEPT[target['concept']] # ================================================================ # CNMS # ================================================================ if 'CNMS' in mappingsystems: cnmap.runSourceMapping(lm, sourceMappingLimit=mappingLimit, minSecondaryScore=secondaryMinScore) # ================================================================ # DSMS MAPPING SYSTEM (formerly KMS) # ================================================================ if ((source.get('concept') in (None, 'NULL', 'NONE', '')) and ('DSMS' in mappingsystems) and (lang in ('en', 'ru', 'es'))): target_f = target['form'] if 'form' in target else target[ 'lemma'] source_f = source['form'] if 'form' in target else source[ 'lemma'] found_lms = False words = sorted(parsed_sent['word'], key=lambda w: w['idx']) twords = sorted(in_sent['word'], key=lambda w: w['idx']) # logger.info(pformat(in_sent['word'])) # Try looking for a relation first relations = parsed_sent[parser_name]['relations'] found_lms = find_lm5(target_f, source_f, relations) if not found_lms: found_lms = find_lm3(target_f, source_f, twords) # if not found_lms: # found_lms = find_lm4(target, source, words) logger.debug('DSMS: found_lms: %s' % found_lms) if found_lms: target_l, source_l, _r = found_lms[0] target['rlemma'] = target_l source['rlemma'] = source_l if _r != '-': r = _r.split('.')[0] if '.' in _r else _r dimensions = mapping.assign2(source_l, r) else: dimensions = mapping.gassign(source_l, target_l) scon = dimensions[0].upper() if dimensions else None else: scon = None target_l = target[ 'lemma'] if 'lemma' in target else target['form'] source_l = source[ 'lemma'] if 'lemma' in source else source['form'] # dd = ', '.join(' '.join(d) for d in deprels(words)) # log('could not find %s - %s in %s' % (target_f, source_f, dd)) source['concept'] = scon + dsmsScoreStr if scon else 'NONE' if scon: if source.get('extractor'): source['extractor'] += ':DSMS' else: source['extractor'] = 'DSMS' # ------------------------------------------------------------------- # # OUTPUT FILE GENERATION m4test.writeOutput(in_jdata)
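
# Sketch of the source-concept annotation the DSMS branch writes: the
# concept label is suffixed with ':<rank>:<score>' built from the dsms
# defaults above (the concept name here is invented for illustration):
#
#   dsmsScoreStr = ':%s:%s' % (2, 0.10)           # -> ':2:0.1'
#   source['concept'] = 'DISEASE' + dsmsScoreStr  # -> 'DISEASE:2:0.1'
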
def main(): """ Runs source dimension identification. """ # ------------------------------------------------------------------- # # INITIALIZATION m4test = ix.IARPATestCommand( 'metas', 'Map LMs with concepts to source dimensions.') cmdline = m4test.parseCmdLine() jdata = m4test.getJSON() # Run the parser # parsed_jdata = parse(jdata['lang'], [s['text'] for s in jdata['sentences']]) def lemma(source): pass # ------------------------------------------------------------------- # # MAIN APPLICATION LOGIC lang = jdata['lang'] in_sentences = jdata['sentences'] if lang != 'fa': tt = mnjson.MNTreeTagger(lang) tt.cleanText(in_sentences) tt.run(in_sentences) tt.processLMs(in_sentences) cnmap = ConceptualNetworkMapper(lang, cmdline.cachedir) for sent in jdata['sentences']: if 'lms' not in sent: continue for lm in sent['lms']: # Note that dimension here is in the form # CONCEPT.Dimension, e.g. DISEASE.Type. An UNRESOLVED problem here # for the GMR system is what happens when the CONCEPT part we # calculated doesn't match with what IARPA provides in the XML. # INTEGRATE NEW SYSTEM HERE! source, target = lm['source'], lm['target'] source_f = source['form'] target_f = target['form'] source_l = source['lemma'] if 'lemma' in source else source['form'] target_l = target['lemma'] if 'lemma' in target else target['form'] source_pos = source['pos'] if 'pos' in source else '' target_pos = target['pos'] if 'pos' in target else '' sschemas, sourceconceptdim = cnmap.getSourceSchemasAndDimensionFromLemma( source_l, source_pos) if not sourceconceptdim: sschemas, sourceconceptdim = cnmap.getSourceSchemasAndDimensionFromLemma( source_f) if not USE_DARIOS or lang == 'fa': source['dimension'] = sourceconceptdim lm['extractor'] = 'WMS' else: source_c = source['concept'].lower() ((_, dim, sdim), confident) = subdim_match(_lang[lang], source_l, target_l, source_c) sd_pair = u'%s.%s' % (dim.upper(), capwords(sdim, '_')) source[ 'dimension'] = sd_pair if confident else sourceconceptdim lm['extractor'] = 'DMS' if confident else 'WMS' source['schemas'] = sschemas # ------------------------------------------------------------------- # # OUTPUT FILE GENERATION m4test.writeOutput(jdata)