Example #1
    def add_lexical_relations(self,
                              table,
                              restriction=["N", "V"],
                              table_type="voisins",
                              stop_words=set([])):
        """precompute lexical relations for each token 
	in corpus, according to provided table
	
	table should be indexed on combo of lemma/pos/relation type
	and yield a real value
	
	TODO: 
	x- each word should be there with itself at similarity = 1.0
	-if no restriction, all related tokens are extracted. 
	should be adjusted wrt type and location: 
		- tokens in same text
		- tokens in corpus/subcorpus
		- type: part of speech, grammatical relations if available
	"""
        self._voisins = defaultdict(list)
        self._vsn_domain = restriction
        for doc in self._docs.values():
            # lemmas with an admissible part of speech that have not been
            # looked up yet
            vocab = set([
                x.lemma() for x in doc._prep._tokens.values()
                if x.simple_mp_cat() in restriction
                and not self._voisins.has_key(x.lemma())
            ])
            vocab = vocab - stop_words
            print >> sys.stderr, "looking up additional %d words" % len(vocab)
            if table_type == "voisins":
                self._voisins.update(get_voisins_dict(table, vocab))
            elif table_type == "synos":
                self._voisins.update(get_syno_norm_dict(vocab, table))
            else:
                print >> sys.stderr, "unimplemented lexical resource type, use 'voisins' or 'synos':", table_type
                sys.exit(1)
            # every word is its own neighbour, with maximal similarity
            for word in vocab:
                self._voisins[word].append((word, 1.0))
        # also store the rank of each neighbour in the entry's list:
        # neighbours are sorted by decreasing similarity, and the one at
        # 0-based position i is mapped to the reciprocal rank 1 / (i + 1)
        self._ranked_vsn = {}
        for entry in self._voisins:
            self._ranked_vsn[entry] = [
                (x[1], 1.0 / (i + 1)) for (i, x) in enumerate(
                    sorted([(-s, w) for (w, s) in self._voisins[entry]]))
            ]
            self._ranked_vsn[entry] = dict(self._ranked_vsn[entry])
            # freeze the neighbour list itself into a similarity dict
            self._voisins[entry] = dict(self._voisins[entry])
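
The last loop above is the subtle part: for each entry, neighbours are sorted by decreasing similarity (by negating the score), and `_ranked_vsn` maps every neighbour word to the reciprocal of its 1-based rank, while `_voisins` itself is frozen into a plain word-to-similarity dict. A minimal standalone sketch of that transformation, with toy data and illustrative names (`voisins`, `ranked_vsn` here are local stand-ins, not the original attributes); it runs under both Python 2 and 3:

    voisins = {"maison": [("maison", 1.0), ("demeure", 0.8), ("logement", 0.6)]}

    ranked_vsn = {}
    for entry in voisins:
        # sort by decreasing similarity by negating the score, as above
        ordered = sorted([(-s, w) for (w, s) in voisins[entry]])
        # the neighbour at 0-based position i gets weight 1 / (i + 1)
        ranked_vsn[entry] = dict(
            (w, 1.0 / (i + 1)) for (i, (_, w)) in enumerate(ordered))
        # freeze the pair list into a similarity dict
        voisins[entry] = dict(voisins[entry])

    # ranked_vsn["maison"] == {"maison": 1.0, "demeure": 0.5, "logement": 1.0 / 3}
    # voisins["maison"]["demeure"] == 0.8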
Example #2
    if options.merge:
        # merge mode: build the feature map from files in the current directory
        feats = FeatureMap("", empty=True)
        feats.init_from_dir(".", suffix=options.merge)
        basename = "no base file to consider"
    else:
        if len(args) > 0:
            basename = args[0]
            feats = FeatureMap(basename + ".features", weird=options.weird)
            feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2])
            if allfuncs != {}:
                doc = annodisAnnot(basename + ".xml")
                prep = Preprocess(basename + ".txt.prep.xml")
                doc.add_preprocess(prep)
            if options.voisins:
                # attach neighbour lists, frozen as dicts for direct lookup
                doc._voisins = get_voisins_dict(table, doc._vocab)
                for entry in doc._voisins:
                    doc._voisins[entry] = dict(doc._voisins[entry])
        else:
            print >> sys.stderr, "Usage: script file-basename ?", args
            sys.exit(1)

    if options.merge:
        feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2, "m#FILE"])
    else:
        feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2])

    for onename, onefunc in allfuncs.items():
        feats.process(doc, onefunc, propagate=options.simple,
                      strand_orphans=options.strand_orphans)
        print >> sys.stderr, onename, "done"
    if options.distance:
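
The two `feats.index(...)` calls differ only in the extra `m#FILE` key: in merge mode the map aggregates instances from many files, presumably so that the source file becomes part of the key and pairs from different files stay distinct. A small sketch of that keying choice; `build_index_keys` is a hypothetical helper, not part of the original script:

    def build_index_keys(index1, index2, merge=False):
        # hypothetical helper: meta-feature names carry the "m#" prefix,
        # as in the script above
        keys = ["m#%s" % index1, "m#%s" % index2]
        if merge:
            # merged maps also key instances by their source file
            keys.append("m#FILE")
        return keys

    # build_index_keys("ID1", "ID2") == ["m#ID1", "m#ID2"]
    # build_index_keys("ID1", "ID2", merge=True) == ["m#ID1", "m#ID2", "m#FILE"]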