Example #1
def extract(self, sentences):
    pdesc = parserdesc(self.lang)
    process, keep = pdesc.config
    # Patch: the RASP POS tag lives in 'rpos' rather than 'pos'
    key = Struct(form='form',
                 pos=('rpos' if pdesc.name == 'rasp' else 'pos'),
                 lem='lem')
    recs = []
    for _, deps in JsonDepBuilder(key=key).build(sentences, keep=keep):
        recs.extend([(sid, process(dep)) for sid, dep in deps])
    return recs
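
A minimal sketch of how the key remapping behaves (the Struct fields are taken from the snippet; the token record is hypothetical):

# For RASP output the POS tag is read from 'rpos'; other parsers use 'pos'.
key = Struct(form='form', pos='rpos', lem='lem')
token = {'form': 'ran', 'rpos': 'VVD', 'lem': 'run'}  # hypothetical record
token[key.pos]  # -> 'VVD'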
Example #2
def parser_for(lang):
    """Create a Parser object for a specific language.

    :param lang: one of 'en', 'es', 'ru', 'fa'.
    :returns: a Parser object.
    """
    pdesc = parserdesc(lang)
    return Parser(name=pdesc.name,
                  config=pdesc.config,
                  command=pdesc.command,
                  debug=True,
                  encoding='utf-8')
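
A hypothetical call, following the docstring:

parser = parser_for('en')  # a Parser configured for English, debug on, UTF-8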
Example #3
def main():
    """
    Runs LM to concept mapping.
    """
    global REMAP_CONCEPT
    # ------------------------------------------------------------------- #
    # INITIALIZATION
    m4test = ix.IARPATestCommand('metam',
                                 'Map LMs to target and source concepts.')

    # add some custom cmdline parameters
    aparser = m4test.getArgParser()
    cmdline, config = m4test.parseCmdLineConfig('m4mapping')
    in_jdata = m4test.getJSON()

    # ------------------------------------------------------------------- #
    # MAIN APPLICATION LOGIC

    lang = in_jdata['lang']
    mappingsystems = config.getList('mappingsystems', lang=lang)
    if not mappingsystems:
        mappingsystems = ['CNMS', 'DSMS', 'DLS']
    secondaryMappingThreshold = config.getFloat('secondarymappingthreshold',
                                                lang=lang,
                                                default=0.1)
    secondaryMinScore = config.getFloatFromComp('cnms',
                                                'secondaryminscore',
                                                lang=lang,
                                                default=0.1)
    mappingLimit = config.getIntFromComp('cnms',
                                         'sourcelimit',
                                         lang=lang,
                                         default=2)
    if secondaryMappingThreshold:
        m4test.setSecondaryMappingThreshold(secondaryMappingThreshold)
    conceptrank = config.getListFromComp('cnms',
                                         'targetconceptranking',
                                         lang=lang)
    expansionTypes = config.getListFromComp('cnms',
                                            'expansiontypes',
                                            lang=lang)
    expansionScoreScale = config.getFloatFromComp('cnms',
                                                  'expansionscorescale',
                                                  lang=lang,
                                                  default=1.0)
    dsmsdefaultrank = config.getIntFromComp('dsms',
                                            'defaultrank',
                                            lang=lang,
                                            default=2)
    dsmsdefaultscore = config.getFloatFromComp('dsms',
                                               'defaultscore',
                                               lang=lang,
                                               default=0.10)
    dsmsScoreStr = ':%s:%s' % (dsmsdefaultrank, dsmsdefaultscore)
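    # e.g. the defaults rank=2, score=0.1 yield ':2:0.1', which is appended
    # to the mapped concept name further below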

    # initialize CNMS system
    # this is always used at least for target concept lookups
    cnmap = ConceptualNetworkMapper(in_jdata['lang'],
                                    cmdline.cachedir,
                                    useSE=cmdline.useSE,
                                    govOnly=True,
                                    disableFN=True,
                                    targetConceptRank=conceptrank,
                                    expansionTypes=expansionTypes,
                                    expansionScoreScale=expansionScoreScale)

    # ------------------------------------------------------------------- #
    # Invoke here the parser and add tags to the sentences element of the JSON input
    in_sentences = in_jdata['sentences']

    # run POS/Lemmatizer for all languages except Persian (CNMS)
    if lang != 'fa':
        tt = mnjson.MNTreeTagger(lang)
        tt.cleanText(in_sentences)
        tt.run(in_sentences)
        tt.processLMs(in_sentences)

    # run dependency parser for English, Russian, and Spanish
    if lang in ('en', 'ru', 'es') and 'DSMS' in mappingsystems:
        ss = [s['ctext'] for s in in_sentences]
        logger.info('begin parsing sentence block, lang: %s, len: %d',
                    lang, len(ss))
        out_jdata = parse(in_jdata['lang'], ss)
        logger.info('end parsing sentence block')
        mapping = Assigner(lang)
    else:
        out_jdata = in_jdata

    currentTestItem = ''
    parser_name = parserdesc(lang).name
    for in_sent, parsed_sent in zip(in_sentences, out_jdata['sentences']):
        testItemId = in_sent['id'].split(u':')[1]
        if testItemId != currentTestItem:
            currentTestItem = testItemId
            logger.warning('mapping sentences in %s', currentTestItem)

        if 'lms' not in in_sent:
            continue

        for lm in in_sent['lms']:
            source, target = lm['source'], lm['target']
            # ===============================================================
            # TARGET CONCEPT MAPPING: ALWAYS USE CNMS
            # ===============================================================
            cnmap.runTargetMapping(lm)
            lm['extractor'] = 'CNMS'

            # remap targetconcepts if needed.  this is a hack to deal with
            # IARPA's inconsistency about concept coverage
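            # REMAP_CONCEPT is a module-level dict of
            # {old concept name: new concept name} (shape inferred from use)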
            if target.get('concept') in REMAP_CONCEPT:
                target['concept'] = REMAP_CONCEPT[target['concept']]

            # ================================================================
            # CNMS
            # ================================================================
            if 'CNMS' in mappingsystems:
                cnmap.runSourceMapping(lm,
                                       sourceMappingLimit=mappingLimit,
                                       minSecondaryScore=secondaryMinScore)

            # ================================================================
            # DSMS MAPPING SYSTEM (formerly KMS)
            # ================================================================

            if (source.get('concept') in (None, 'NULL', 'NONE', '')
                    and 'DSMS' in mappingsystems
                    and lang in ('en', 'ru', 'es')):
                target_f = target['form'] if 'form' in target else target['lemma']
                source_f = source['form'] if 'form' in source else source['lemma']
                found_lms = False

                words = sorted(parsed_sent['word'], key=lambda w: w['idx'])
                twords = sorted(in_sent['word'], key=lambda w: w['idx'])

                # Try looking for a relation first
                relations = parsed_sent[parser_name]['relations']
                found_lms = find_lm5(target_f, source_f, relations)

                if not found_lms:
                    found_lms = find_lm3(target_f, source_f, twords)

                # disabled third fallback:
                # if not found_lms:
                #     found_lms = find_lm4(target, source, words)

                logger.debug('DSMS: found_lms: %s', found_lms)

                if found_lms:
                    target_l, source_l, _r = found_lms[0]
                    target['rlemma'] = target_l
                    source['rlemma'] = source_l
                    if _r != '-':
                        r = _r.split('.')[0] if '.' in _r else _r
                        dimensions = mapping.assign2(source_l, r)
                    else:
                        dimensions = mapping.gassign(source_l, target_l)

                    scon = dimensions[0].upper() if dimensions else None
                else:
                    scon = None
                    target_l = target['lemma'] if 'lemma' in target else target['form']
                    source_l = source['lemma'] if 'lemma' in source else source['form']
                    # disabled debug output:
                    # dd = ', '.join(' '.join(d) for d in deprels(words))
                    # log('could not find %s - %s in %s' % (target_f, source_f, dd))
                source['concept'] = (scon + dsmsScoreStr) if scon else 'NONE'
                if scon:
                    if source.get('extractor'):
                        source['extractor'] += ':DSMS'
                    else:
                        source['extractor'] = 'DSMS'

    # ------------------------------------------------------------------- #
    # OUTPUT FILE GENERATION
    m4test.writeOutput(in_jdata)
Example #4
class Partial(object):
    # class header and __init__ signature assumed from the fragment and the
    # Partial(...) call site below; stores the callable plus kwargs on self
    def __init__(self, f, **entries):
        update(self, f=f, **entries)

    def __call__(self, fn):
        self.f(fn, **vars(self))
        
if __name__ == '__main__':
    args = argparser().parse_args()

    def noext(fn):
        fn, _ = splitext(fn)
        return fn

    logging.basicConfig(level=logging.INFO)
    
    fnames      = set(map(noext, flattened(iglob(a) for a in args.f)))
    desc        = parserdesc(args.lang)
    config      = desc.config
    # txtor      = make_translator(desc.name, config[0], config[1])
    txtor_args  = (desc.name, config[0], config[1])
    # writer     = make_writer(args.dbtype, 'test', args.clear, args.lang)
    writer_args = (args.dbtype, 'test', args.clear, args.lang)
    # process_fn = partial(process, writer_args=writer_args, txtor=txtor, chunk=args.chunk)
    process_fn  = Partial(process, writer_args=writer_args, txtor_args=txtor_args, chunk=args.chunk)
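    # Partial (rather than functools.partial, see the commented line above)
    # presumably keeps process_fn easy to pickle for worker processes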

    msg = """\
        Running parameters:
        --------------------------------
        Language:           {lang}
        Database:           {dbtype}
        Clear database:     {clear}
        File count:         {file_count}
Example #5
def m4detect(lang,
             json_in,
             seed_fn,
             invoke_parser=False,
             extend_seeds=False,
             **kw):
    """Metaphor detection using the seed system.

    :param lang: language (one of 'en', 'es', 'ru', 'fa')
    :param json_in: the json document object (a dict) containing at least a 'sentences' key
    :param seed_fn: a list of seeds
    :param invoke_parser: whether to invoke the parser on the sentences in the json doc
    :param extend_seeds: whether or not to try to extend seeds (English only)
    :returns: json_in with a list of the found LMs appended to each sentence
    """
    relations, json_out = extract(json_in, lang, invoke_parser)

    tokenizer = parserdesc(lang).tokenizer

    def lm(sentence, relation, seed, noun_l, verb_l):
        """Outputs a LM with all the required keys.
        """
        def offset(lemma, idx):
            "Finds offest of <lemma> in sentence."
            words = tokenizer(sentence)
            try:
                w = words[idx]
                word = w[0] if len(w) == 2 else w
                start = sentence.find(word)
                return dict(start=start, end=start + len(word))
            except IndexError:
                dprint(u'Problem finding offset of', lemma, 'at', idx, 'in:')
                dpprint((idx, words))
                return dict(start=-1, end=-1)

        def dom(word, rel):
            return dict(offset(word.form, word.idx),
                        lpos=u'{0[0]}.{0[1]}'.format(word.lemma),
                        lemma=word.lemma[0],
                        form=word.form[0],
                        rel=rel)
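        # dom() assumes word.form and word.lemma are (text, pos)-style pairs
        # (note the [0]/[1] indexing), so lpos renders e.g. u'run.v'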

        n_rel, v_rel = relation.split('-')
        noun, verb = rels[noun_l, verb_l]
        dprint('lm:', '\n  noun', noun, '\n  verb', verb)
        return dict(name=u'{0[0]} {1[0]}'.format(noun.lemma, verb.lemma),
                    target=dom(noun, n_rel),
                    source=dom(verb, v_rel),
                    seed=u' '.join(u'%s.%s' % s for s in seed))

    # TODO: optimization: this should be created once at the beginning. Perhaps on import?
    mfinder = MetaphorFinderEx(lang, seed_fn, extend_seeds)

    # TODO: this is inefficient: Python will evaluate arguments anyway
    #     dprint('All possible metaphors:')
    #     dforeach(partial(dump, indent=1), sorted(mfinder.mbuilder.metaphors))

    # relations grouped by sentence id
    depsbysent = groupby(relations, key=lambda rel: rel[0])  # rel == (sent_id, dep)
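    # itertools.groupby only merges consecutive items, so 'relations' is
    # assumed to arrive ordered by sentence id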
    sentences = json_out['sentences']
    for i, deps in ((sid - 1, list(grp)) for sid, grp in depsbysent):  # ids appear 1-based
        # index deps by <noun-lemma, verb-lemma> pairs
        rels = dict(((n_l, v_l), (Struct(lemma=n_l, form=n_f, idx=int(n_idx)),
                                  Struct(lemma=v_l, form=v_f, idx=int(v_idx))))
                    for _, (n_idx, v_idx, _, n_f, n_l, v_f, v_l) in deps)
        mets = mfinder.find(rels.keys())
        sent = sentences[i]
        dprint('_' * 96, '\n', sent['text'])
        dforeach(partial(dump, indent=1), rels.keys())
        lms = [
            lm(sent['text'], rel, seed, noun_l, verb_l)
            for (rel, seed, (noun_l, verb_l)) in mets
        ]
        dprint('found LMs:', pformat(lms))
        sent['lms'] = lms

    jsonout = dict((k, v) for k, v in json_in.items() if k != 'sentences')
    jsonout['sentences'] = sentences
    return jsonout
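
A hypothetical invocation (the shapes of the JSON document and the seed list are assumed; seed_fn is treated as an in-memory list, as the docstring says):

out = m4detect('en', json_doc, seeds, invoke_parser=True)  # json_doc, seeds: hypothetical
lms = [lm for sent in out['sentences'] for lm in sent.get('lms', [])]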