Example #1
# Inferred imports: os is standard library; dataset and vectorsim are
# project-local modules (used below as dataset.Dataset and vectorsim.*).
# WordVec, FeatureSet, Shingler, printd, and conf are assumed to come
# from other project-local modules that this snippet omits.
import os

import dataset
import vectorsim


def processData(args):
	data = dataset.Dataset.load(args.input_data, args.dataset)
	wvout = args.wvfile
	if os.path.exists(wvout):
		wordvecf = wvout
	else:
		wordvecf = args.wvsource

	features = {x for x in args.basefeatures.split(',') if x != ''}
	matchers = {x for x in args.matchers.split(',') if x != ''}

	printd("Loading Word Vectors")
	wordvec = WordVec(wordvecf)
	printd("Vectorizing")
	data.vectorize(wordvec)
	maxwords = data.maxShortSentence()

	if wvout != wordvecf:
		printd("Rereading word vectors to optimize...")
		wv_toks = data.wv_sentences()
		wordvec = WordVec(wordvecf, sentences=wv_toks, wvout=wvout, size=wordvec.originalsize)
		data.vectorize(wordvec)

	conf.wvsize = wordvec.size

	# Training data: compute the base feature set
	printd("Computing basic WV Features")
	fs = FeatureSet(data, features)

	if "Pair" in matchers:
		printd("Computing Pair Features")
		matcher = vectorsim.PairFeatures(dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	if "Shingle" in matchers:
		printd("Computing Shingle Features")
		matcher = Shingler(slop=12, lmbda=0.95)
		fs.addMatcher(matcher)

	vocab = None
	if "MinDistSim" in matchers:
		printd("Computing MinDist")
		vocab = fs.data.wv_vocab()
		data.weight()
		comparator = 'cosine'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos')
		printd("Computing MinDist-Euclidean")
		comparator = 'euclidean'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'euc')

	if "NGram" in matchers:
		printd("Computing MinDist-Ngram")
		if vocab is None:
			vocab = fs.data.wv_vocab()
		comparator = 'cosine'
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, ngram=2, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos-bigram')
		matcher = vectorsim.MinDistSim(metric=comparator, df=vocab, maxsent=maxwords, ngram=3, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher, 'cos-trigram')

	if "WWSim" in matchers:
		printd("Computing WWSim")
		matcher = vectorsim.WWSim(wordvec=wordvec, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	if "InfRankSim" in matchers:
		printd("Computing InfRankSim")
		matcher = vectorsim.InfRankSim(data=data, wordvec=wordvec, dimfeatures=args.dimfeatures)
		printd("InfRankSim Matching")
		fs.addMatcher(matcher)

	if "InfSim" in matchers:
		# Normalize only now, so the primary features above were computed
		# from the raw (unnormalized) word vectors; InfSim expects
		# normalized vectors.
		printd("Computing InfSim")
		wordvec.normalize()
		data.vectorize(wordvec)
		matcher = vectorsim.InfSim(data=data, wordvec=wordvec, dimfeatures=args.dimfeatures)
		fs.addMatcher(matcher)

	return fs
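
A minimal sketch of how processData might be driven from the command line. Every flag name below is inferred from the args.* attributes the function reads; none are confirmed by the source:

import argparse

# Hypothetical driver for processData; all flag names are assumptions
# reverse-engineered from the args.* attributes used above.
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", dest="input_data")
parser.add_argument("--dataset", default="semeval")
parser.add_argument("--wvfile", help="optimized word-vector cache (reused if it exists)")
parser.add_argument("--wvsource", help="fallback source word-vector file")
parser.add_argument("--basefeatures", default="")
parser.add_argument("--matchers", default="Pair,MinDistSim")
parser.add_argument("--dimfeatures", action="store_true")

fs = processData(parser.parse_args())
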
Example #2
# Inferred imports: standard library plus numpy. Module paths for Dataset
# and the matchers are inferred from Example #1; WordVec, printd, conf,
# shingle, Match, Timer, and min_score are assumed to be defined in other
# project-local modules that this snippet omits.
import json
import os
import sys

import numpy as np

from dataset import Dataset
from vectorsim import InfSim, MinDistSim, VecSim


def main(args):
    global wordvec, wordvecf
    conf.debug = args.debug or args.verbose
    conf.verbose = args.verbose
    conf.args = args
    #nf = args.nuggets
    #uf = args.updates
    #mf = args.matches
    sf = args.shingles
    vf = args.wordvec
    #ef = args.evalfile
    wvout = args.wvfile
    sim_thr = args.sim_thr
    dset = args.dataset
    limit = args.limit

    #if args.dataset == "auto":
    #	if ef is not None:
    #		dset = "semeval"
    #	else:
    #		with open(glob.glob(nf)[0]) as nh:
    #			nfhead = nh.readline()
    #			if nfhead.startswith("query_id\tnugget_id"):
    #				dset = "ts"
    #			elif nfhead.startswith("query_id\tvs_id"):
    #				dset = "mclick"
    #			else:
    #				dset = "1click"

    if os.path.exists(wvout) and not args.force:
        wordvecf = wvout

    if vf:
        printd("Reading word vector...")
        #wordvec = load_wordvec()
        wordvec = WordVec(wordvecf)

    if args.sim == "minsim":
        matcher = MinDistSim
    elif args.sim == "infsim":
        matcher = InfSim
    else:
        matcher = VecSim

    if args.sim == "infsim" or args.comparator == "infsim":
        wordvec.normalize()

    #if dset == "ts":
    #	nuggfn = Nuggets
    #	updfn = Updates
    #	outfn = MatchWriter
    #elif dset == "1click":
    #	nuggfn = CLNuggets
    #	updfn = CLUpdates
    #	outfn = CLMatchWriter
    #elif dset == "mclick":
    #	nuggfn = MCNuggets
    #	updfn = Updates
    #	outfn = MCMatchWriter
    #elif dset == "semeval":
    #	data = SemEvalDataset(args.input_data, args.evalfile)
    #	outfn = data.writer
    #	if vf is not None:
    #		data.vectorize(wordvec)
    #else:
    #	nuggfn = MCNuggets
    #	updfn = Updates
    #	outfn = MCMatchWriter

    data = Dataset.load(args.input_data, dset)
    if vf is not None:
        data.vectorize(wordvec)
    #if dset == "semeval":
    #	data = SemEvalDataset(args.input_data, args.evalfile)
    #	#outfn = data.writer
    #	if vf is not None:
    #		data.vectorize(wordvec)
    #else:
    #	printd("Processing Nuggets...")
    #	#nuggets = nuggfn(nf, vectorize=vf is not None)

    #	printd("Processing Updates...")
    #	#updates = updfn(uf, vectorize=vf is not None)
    #	#data = NuggetDataset(nuggets, updates, mf)
    #	data = NuggetDataset(nf, uf, mf, dset=dset, vectorize=vf is not None)

    if vf and wvout is not None and wvout != wordvecf:
        printd("Rereading word vectors to optimize...")
        wv_toks = data.wv_sentences()
        #if dset == "semeval":
        #	wv_toks = data.wv_sentences()
        #else:
        #	wv_toks = nuggets.wv_text() + updates.wv_text()
        wordvec = WordVec(wordvecf,
                          sentences=wv_toks,
                          wvout=wvout,
                          size=wordvec.originalsize)
        if args.sim == "infsim" or args.comparator == "infsim":
            wordvec.normalize()
        data.vectorize(wordvec)
        with open(wvout + ".vocab", 'w') as wh:
            wh.write("\n".join(wordvec.vocab().keys()))
        with open(wvout + ".toks", 'w') as wh:
            wh.write("\n".join([" ".join(x) for x in wv_toks]))
        #vocab = nuggets.wv_vocab().union(updates.wv_vocab())
        #wordvec.trim(lambda word, count, min_count: gensim.utils.RULE_KEEP if word in vocab else gensim.utils.RULE_DISCARD)
        #wordvec.save(wvout)

    vocab = None
    if args.frequencies:
        try:
            with open(args.frequencies) as fh:
                vocab = json.load(fh)
            # For Term Frequencies instead of Document Frequencies
            # Could also do len(vocab[word]) if wanted to mimic DF
            if isinstance(vocab.itervalues().next(), dict):
                for word in vocab:
                    vocab[word] = sum(vocab[word].itervalues())
        except Exception:
            pass
    if vocab is None:
        vocab = data.wv_vocab()
    logdf = wordvec.logdf(vocab)
    logdffile = wordvecf + ".logdf"
    #if not os.path.exists(logdffile) or (os.path.getmtime(logdffile) < os.path.getmtime(wordvecf)):
    #	np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")
    np.savetxt(logdffile, logdf, delimiter=" ", fmt="%g")

    if args.comparator == "infsim" and args.sim != "infsim":
        comparator = InfSim(logdf).pairwisedist
    else:
        comparator = args.comparator

    matcher = matcher(df=logdf, metric=comparator)
    data.normalize(matcher, logdf)

    printd("Finding matches...")
    matches = []
    with data.writer(sf) as sw, data.writer(vf) as vw:
        mcnt = 0
        timer = Timer()
        for pair in data.test():
            if sf:
                match = shingle(pair.s1["tokens"], pair.s2["tokens"])
                if match.score >= min_score:
                    sw.write(pair, match)

            if vf:
                printd("Matching pair %s" % (pair.pid), level=1)
                try:
                    sim = matcher.match(pair)
                    matches.append((matcher.tsim, unicode(matcher)))
                except ValueError, err:
                    printd(err)
                    sim = sim_thr
                printd("Match %0.4f for %s, %s" % (sim, pair.sid1, pair.sid2))
                if sim < sim_thr:
                    sim = sim_thr
                    start = matcher.start
                    end = matcher.end - matcher.start
                else:
                    start = -1
                    end = len(pair.s2["tokens"]) - 1
                match = Match(sim, start, end)
                vw.write(pair, match)

            mcnt += 1
            if (mcnt % 100000) == 0:
                # Throughput in thousands of matches per second ("tmps"):
                # 100000 pairs / timer.mark() seconds = 100/t thousand.
                print >> sys.stderr, "%g tmps" % (100 / timer.mark())
            if limit and mcnt >= limit:
                return

        if conf.verbose:
            for tsim, match in sorted(matches):
                print match
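
The args.frequencies branch in main accepts either a flat word-to-count mapping or a nested word-to-{doc: count} mapping, which it flattens by summing the inner counts. A small sketch of both shapes, using a hypothetical file name:

import json

# Flat term frequencies: loaded and used as-is.
flat = {"the": 120483, "vector": 512}

# Nested per-document counts: main() detects dict values and sums them,
# reducing this form to the flat one above.
nested = {"the": {"doc1": 100, "doc2": 120383},
          "vector": {"doc1": 512}}

with open("freqs.json", "w") as fh:  # hypothetical path
    json.dump(nested, fh)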