Example #1
def make_extension_matrix(samp_hdfile, tdict_pkl_fname, reverse_tdict_pkl_fname):
    reverse_vocab = [lemma.decode("utf-8") for lemma in samp_hdfile["vocab"]]
    vocab = dict((lemma, i) 
                 for i, lemma in enumerate(reverse_vocab))
    assert len(reverse_vocab) == len(vocab)
    
    tdict = TransDict.load(tdict_pkl_fname)
    # disable POS mapping
    tdict.pos_map = None
    
    reverse_tdict = TransDict.load(reverse_tdict_pkl_fname)
    reverse_tdict.pos_map = None
    
    shape = len(vocab), len(vocab)
    log.info("making extension matrix as sparse lil_matrix {0}".format(shape))
    em = sp.lil_matrix(shape, dtype="int8")
    
    for i, target_lemma in enumerate(reverse_vocab):
        try:
            reverse_lookup = reverse_tdict.lookup_lemma(target_lemma)
        except KeyError:
            # vocab term not in reverse dict
            # FIXME: these terms should be removed from vocab
            continue
        
        log.debug(40 * "=")
        
        for _, source_lempos_list in reverse_lookup: 
            for source_lempos in source_lempos_list:
                target_lempos_list = tdict.lookup_lempos(source_lempos)[1]
                for target_lempos in target_lempos_list:
                    # does not handle MWUs (multi-word units), but vocab
                    # contains only atomic lemmas so far
                    ext_target_lemma = target_lempos.rsplit("/",1)[0]
                    try:
                        j = vocab[ext_target_lemma]
                    except KeyError:
                        # out-of-vocabulary target lemma
                        continue
                    
                    log.debug(u"{0} --> {1} --> {2}".format(
                        target_lemma,
                        source_lempos, 
                        ext_target_lemma))
                    # counting occurrences does not make a lot of sense,
                    # so assume boolean
                    em[i,j] = 1
                    
        if log.isEnabledFor(logging.DEBUG):
            log.debug(u"{0} ==> {1}".format(
                target_lemma,
                ", ".join([str((reverse_vocab[j], count)) for j, count in zip(em.rows[i], em.data[i])])))
                   
    log.info("converting to csr_matrix") 
    return em.tocsr()
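
A minimal usage sketch for make_extension_matrix above, written in the same Python 2 style as the surrounding examples. The HDF5 path, the two pickle paths and the use of h5py to open the sample file are assumptions for illustration only; they are not taken from the example itself.

import h5py

# hypothetical input files: a sample file with a "vocab" dataset of
# UTF-8 encoded lemmas, plus pickled TransDict objects in both directions
samp_hdfile = h5py.File("de_samples.hdf5", "r")
tdict_pkl_fname = "en-de_dict.pkl"
reverse_tdict_pkl_fname = "de-en_dict.pkl"

# build the boolean extension matrix over the target vocabulary
ext_matrix = make_extension_matrix(samp_hdfile, tdict_pkl_fname,
                                   reverse_tdict_pkl_fname)
print ext_matrix.shape, ext_matrix.nnz
samp_hdfile.close()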
Example #2
def make_new_vocab(sample_hdfile, tdict_pkl_fname):
    tdict = TransDict.load(tdict_pkl_fname)
    # disable POS mapping
    tdict.pos_map = None
    
    log.info("extracting target lemmas from translation dictionary")
    dict_target_lemmas = set()
    
    for target_lempos_list in tdict._lempos_dict.itervalues():
        for target_lempos in target_lempos_list:
            # skip multi-word units (MWUs)
            if " " not in target_lempos:
                target_lemma = target_lempos.rsplit("/",1)[0]
                dict_target_lemmas.add(target_lemma)
        
    del tdict
    
    vocab = [t.decode("utf-8") for t in sample_hdfile["vocab"]]
    
    # select column numbers and corresponding target lemmas
    # sorting is required because the order of the column numbers is relevant
    selection = [ (i, lemma) 
                  for i, lemma in enumerate(vocab)
                  if lemma in dict_target_lemmas ]
    
    columns_selector, filtered_vocab = zip(*selection)
    return columns_selector, filtered_vocab
Example #3
def make_new_vocab(sample_hdfile, tdict_pkl_fname):
    tdict = TransDict.load(tdict_pkl_fname)
    # disable POS mapping
    tdict.pos_map = None

    log.info("extracting target lemmas from translation dictionary")
    dict_target_lemmas = set()

    for target_lempos_list in tdict._lempos_dict.itervalues():
        for target_lempos in target_lempos_list:
            # skip multi-word units (MWUs)
            if " " not in target_lempos:
                target_lemma = target_lempos.rsplit("/", 1)[0]
                dict_target_lemmas.add(target_lemma)

    del tdict

    vocab = [t.decode("utf-8") for t in sample_hdfile["vocab"][()]]
    org_size = len(vocab)
    log.info("orginal vocab size: {} lemmas".format(org_size))

    # select column numbers and corresponding target lemmas
    # sorting is required because the order of the column numbers is relevant
    selection = [(i, lemma) for i, lemma in enumerate(vocab) if lemma in dict_target_lemmas]

    columns_selector, filtered_vocab = zip(*selection)

    new_size = len(filtered_vocab)
    log.info("filtered vocab size: {} lemmas".format(new_size))
    reduction = (new_size / float(org_size)) * 100
    log.info("vocab reduced to {:.2f}% of orginal size".format(reduction))

    return columns_selector, filtered_vocab
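
A possible follow-up for make_new_vocab, showing how the returned columns_selector could be used to slice a sample matrix down to the filtered vocabulary. The file name and the "samples" dataset are assumptions for the sake of illustration; only the "vocab" dataset is implied by the example.

import h5py

# hypothetical sample file and pickled dictionary
sample_hdfile = h5py.File("de_samples.hdf5", "r")
tdict_pkl_fname = "en-de_dict.pkl"

columns_selector, filtered_vocab = make_new_vocab(sample_hdfile, tdict_pkl_fname)

# assuming the samples are stored as an array under "samples",
# keep only the columns whose lemmas occur in the translation dictionary
samples = sample_hdfile["samples"][()]
filtered_samples = samples[:, list(columns_selector)]
print filtered_samples.shape, len(filtered_vocab)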
Example #4
def preprocess(data_set, lang_pair):
    source_lang, target_lang = lang_pair.split("-")
    graphs_fname = config["eval"][data_set][lang_pair]["graphs_fname"]
    out_dir = os.path.dirname(graphs_fname)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
        
    # annotate
    annotator = get_annotator(source_lang)
    graph_list = annotator.annot_xml_file(
        config["eval"][data_set][lang_pair]["src_fname"])
    
    # lookup translations
    trans_dict = TransDict.load(config["dict"][lang_pair]["pkl_fname"])
    lookup = Lookup(trans_dict)
    lookup(graph_list)
    
    # score most frequent translation
    freq_score = FreqScorer(config["count"]["lemma"][target_lang]["pkl_fname"])
    freq_score(graph_list)
    
    # dict upper scores  
    lemma_ref_fname = \
        config["eval"][data_set][lang_pair]["lemma_ref_fname"]
    scorer = DictUpperScorer(lemma_ref_fname)
    scorer(graph_list)
    
    # model upper scores  
    ambig_fname = config["sample"][lang_pair]["ambig_fname"]  
    filter = filter_functions(source_lang)
    scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, filter)
    scorer(graph_list)
    
    # save graphs
    log.info("saving preprocessed graphs to " + graphs_fname)
    cPickle.dump(graph_list, open(graphs_fname, "wb"))
    
Example #5
def prepare(lang_pair):
    """
    extract input text, create annotated graphs, look up translation
    candidates, perform frequency scoring and save pickled graphs to file
    """
    source_lang, target_lang = lang_pair.split("-")
    
    # get text from input source
    xml_tree = et.ElementTree(file=config["eval"]["presemt"][lang_pair]["src_fname"])
    text = " ".join(seg.text.strip() for seg in  xml_tree.iter("seg"))

    # annotate
    if source_lang == "en":
        annotator = TreeTaggerEnglish()
    elif source_lang == "de":
        annotator = TreeTaggerGerman()
    else:
        raise ValueError("unknown source language: " + source_lang)
    graph_list = annotator(text)

    # lookup
    trans_dict = TransDict.load(config["dict"][lang_pair]["pkl_fname"])
    lookup = Lookup(trans_dict)
    lookup(graph_list)

    # frequency scoring
    freq_score = FreqScorer(config["count"]["lemma"][target_lang]["pkl_fname"])
    freq_score(graph_list)

    # save
    if not os.path.exists(PREP_DIR):
        os.makedirs(PREP_DIR)
    pkl_fname = join(PREP_DIR, lang_pair + "_graphs.pkl")
    log.info("saving graphs to " + pkl_fname)
    cPickle.dump(graph_list, 
                 open(pkl_fname, "wb"),
                 protocol=cPickle.HIGHEST_PROTOCOL)
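
One way to call prepare and read back its output, assuming the config entries it relies on are in place; the language pair is an arbitrary choice and the pickle path simply mirrors the PREP_DIR convention used in the function body.

import cPickle
from os.path import join

prepare("en-de")

# reload the pickled annotated graphs written by prepare()
pkl_fname = join(PREP_DIR, "en-de_graphs.pkl")
graph_list = cPickle.load(open(pkl_fname, "rb"))
print "loaded {0} graphs".format(len(graph_list))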
Example #6
def setup_class(cls):
    dict_fname = config["dict"]["en-de"]["pkl_fname"]
    print "loading pickled dictionary from " + dict_fname
    cls.trans_dict = TransDict.load(dict_fname)
Example #7
def setup_class(cls):
    dict_fname = config["dict"]["en-de"]["pkl_fname"]
    print "loading pickled dictionary from " + dict_fname
    cls.trans_dict = TransDict.load(dict_fname)
    # remove the POS mapping
    cls.trans_dict.pos_map = None
Example #8
def make_graphs():
    """
    Create annotated translations graphs with scores for random translation,
    most frequent translation and approximated maximum. Also create minimal
    translation dictionaries for these graphs and drawings.
    """
    for lang_pair, src_fname, lemma_ref_fname in [ 
        ("en-de", 
         "sample_newstest2011-src.en.sgm", 
         "lemma_sample_newstest2011-ref.de.sgm"),
        ("de-en", 
         "sample_out_de-en.src", 
         "lemma_sample_out_de-en.ref") ]:
        source_lang, target_lang = lang_pair.split("-")
        root_fname = splitext(src_fname)[0]
        
        # annotate
        annotator = get_annotator(source_lang)
        graphs = annotator.annot_xml_file(src_fname)    
        
        # lookup
        dict_fname = config["dict"][lang_pair]["pkl_fname"]
        trans_dict = TransDict.load(dict_fname)
        lookup = LookupKeepKeys(trans_dict)
        lookup(graphs)
        
        #  write pickle of minimal translation dict
        min_dict = lookup.get_minimal_trans_dict()
        min_dict_fname = "dict_" + root_fname + ".pkl"
        dump(min_dict, open(min_dict_fname, "wb"))
        
        # score most frequent translation
        counts_fname = config["count"]["lemma"][target_lang]["pkl_fname"]
        freq_score = FreqScorer(counts_fname)
        freq_score(graphs)
        
        # score random translation (no counts needed)
        rand_score = RandScorer()
        rand_score(graphs)
        
        # dict upper score
        maxscore = DictUpperScorer(lemma_ref_fname)
        maxscore(graphs)
    
        # model upper scores  
        ambig_fname = config["sample"][lang_pair]["ambig_fname"]  
        filter = filter_functions(source_lang)
        scorer = ModelUpperScorer(lemma_ref_fname, ambig_fname, filter)
        scorer(graphs)
        
        # draw graphs
        draw = Draw()
        draw(graphs, out_format="pdf", 
             base_score_attrs=["dup_score", "mup_score", "freq_score", 
                               "rand_score"], 
             out_dir="_draw_" + lang_pair)
        
        # save graphs
        graphs_fname = "graphs_" + root_fname + ".pkl"
        dump(graphs, open(graphs_fname, "wb"))