Example #1
# This excerpt relies on module-level names from the surrounding project
# (util, fld, posName, negName, meaningfulWords, rgbWords, shapeWords,
# objWords, objInstances); the standard-library imports it needs directly are:
import collections
from collections import Counter

def writePosNeg():
    testObjs = objInstances.keys()
    descObjs = util.getDocsForTest(testObjs)
    # descObjs = util.getDocuments()
    objTokens = util.sentenceToWordDicts(descObjs)
    tknsGlobal = set()
    posTokens = {}
    negSampleTokens = {}
    mostImpTokens = {}
    if len(objTokens.keys()) == 0:
        return 1
    for (key, value) in objTokens.items():
        cValue = Counter(value)
        mostImpTokens[key] = []
        for (k1, v1) in cValue.items():
            if v1 > 10:
                mostImpTokens[key].append(k1)
                if k1 in meaningfulWords:
                    tknsGlobal.add(k1)
                    if k1 in posTokens.keys():
                        kk1 = posTokens[k1]
                        kk1.append(key)
                        posTokens[k1] = kk1
                    else:
                        posTokens[k1] = [key]
    posTokens = collections.OrderedDict(sorted(posTokens.items()))
    f = open(fld + "/" + posName, "w")
    title = "token,objects\n"
    f.write(title)
    for k, v in posTokens.items():
        ll = str(k) + ","
        ll += "-".join(v)
        ll += "\n"
        f.write(ll)
    f.close()
    for kTkn in posTokens.keys():
        negSampleTokens[kTkn] = []
        for (key, value) in objTokens.items():
            if kTkn not in value:
                negSampleTokens[kTkn].append(key)
    negTokens = {}
    negsD = util.doc2Vec(descObjs)
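    # Keep, as negative examples for each token, only those objects that never
    # mention the token and that also appear in util.doc2Vec's result for every
    # object positively associated with that token.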
    for kTkn in posTokens.keys():
        negTokens[kTkn] = negSampleTokens[kTkn]
        posV = posTokens[kTkn]
        for v in posV:
            negDocVec = negsD[v]
            negTokens[kTkn] = list(
                set(negTokens[kTkn]).intersection(set(negDocVec)))

    negTokens = collections.OrderedDict(sorted(negTokens.items()))
    f = open(fld + "/" + negName, "w")
    f.write(title)
    for k, v in negTokens.items():
        ll = str(k) + ","
        ll += "-".join(v)
        ll += "\n"
        f.write(ll)
    f.close()

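    # Additionally split the positive/negative listings into per-category files
    # (rgb, shape, object) using the corresponding word lists.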
    kWord = ["rgb", "shape", "object"]
    for wd in kWord:
        f = open(fld + "/" + wd + posName, "w")
        f1 = open(fld + "/" + wd + negName, "w")
        sWords = []
        f.write(title)
        f1.write(title)
        if wd == "rgb":
            sWords = rgbWords
        elif wd == "shape":
            sWords = shapeWords
        elif wd == "object":
            sWords = objWords
        for k, v in posTokens.items():
            if k in sWords:
                if len(v) > 0:
                    ll = str(k) + ","
                    ll += "-".join(v)
                    ll += "\n"
                    f.write(ll)
                v = negTokens[k]
                if len(v) > 0:
                    ll = str(k) + ","
                    ll += "-".join(v)
                    ll += "\n"
                    f1.write(ll)
        f.close()
        f1.close()
    return 0
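
The positive and negative files written above share one layout: a "token,objects" header followed by one row per token, with the matching object names joined by "-". A minimal sketch of loading such a file back into a dict (the path argument stands in for the fld + "/" + posName path used above):

def readTokenObjects(path):
    # Parse a "token,objects" CSV written by writePosNeg(); the second column
    # joins object names with "-".
    mapping = {}
    with open(path) as f:
        next(f)  # skip the "token,objects" header line
        for line in f:
            token, objects = line.strip().split(",", 1)
            mapping[token] = objects.split("-") if objects else []
    return mapping
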
Example #2

# Same module-level imports and globals as in Example #1 are assumed here.
def writePosNeg():
    testObjs = objInstances.keys()
    descObjs = util.getDocsForTest(testObjs)
    # descObjs = util.getDocuments()
    objTokens = util.sentenceToWordDicts(descObjs)
    tknsGlobal = set()
    posTokens = {}
    negSampleTokens = {}

    mostImpTokens = {}
    for (key, value) in objTokens.items():
        cValue = Counter(value)
        mostImpTokens[key] = []
        for (k1, v1) in cValue.items():
            if v1 > 10:
                mostImpTokens[key].append(k1)
                if k1 in meaningfulWords:
                    tknsGlobal.add(k1)
                    if key in posTokens.keys():
                        kk1 = posTokens[key]
                        kk1.append(k1)
                        posTokens[key] = kk1
                    else:
                        posTokens[key] = [k1]
    posTokens = collections.OrderedDict(sorted(posTokens.items()))
    f = open(fld + "/" + posName, "w")
    title = "object,tokens\n"
    f.write(title)

    for k, v in posTokens.items():
        ll = str(k) + ","
        ll += "-".join(v)
        ll += "\n"
        f.write(ll)
    f.close()

    kWord = ["rgb", "shape", "object"]
    for wd in kWord:
        f = open(fld + "/" + wd + posName, "w")

        sWords = []
        f.write(title)

        if wd == "rgb":
            sWords = rgbWords
        elif wd == "shape":
            sWords = shapeWords
        elif wd == "object":
            sWords = objWords
        for k, v in posTokens.items():
            vv = []
            for v1 in v:
                if v1 in sWords:
                    vv.append(v1)
            if len(vv) > 0:
                ll = str(k) + ","
                ll += "-".join(vv)
                ll += "\n"
                f.write(ll)

        f.close()
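
Compared with Example #1, this variant flips the mapping: rows are keyed by object and list that object's meaningful tokens ("object,tokens"), instead of being keyed by token and listing objects. A minimal sketch of converting between the two layouts, assuming a dict shaped like posTokens above:

def invertMapping(objToTokens):
    # Turn {object: [token, ...]} into {token: [object, ...]}, i.e. the layout
    # written by Example #1.
    tokenToObjs = {}
    for obj, tokens in objToTokens.items():
        for tkn in tokens:
            tokenToObjs.setdefault(tkn, []).append(obj)
    return tokenToObjs
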
Example #3
# The def line is missing from this excerpt; the body computes cosine
# similarity, so a matching header is reconstructed here as an assumption.
def cosine_similarity(x, y):
    numerator = sum(a*b for a,b in zip(x,y))
    denominator = square_rooted(x)*square_rooted(y)
    return round(numerator/float(denominator),3)
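
# square_rooted() is not shown in this excerpt; a minimal stand-in that matches
# the 3-decimal rounding used above (an assumption, not the original helper):
from math import sqrt

def square_rooted(x):
    # Euclidean norm of the vector, rounded to 3 decimals.
    return round(sqrt(sum(a * a for a in x)), 3)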
 

oNames = util.objectNames()
objNames = collections.OrderedDict(sorted(oNames.items()))

docs = util.getDocuments()
docLabels = []
docNames = docs.keys()
for key in docs.keys():
    ar = key.split("/")
    docLabels.append(ar[1])
docLists = util.sentenceToWordLists(docs)
docDicts = util.sentenceToWordDicts(docs)
sentences = LabeledLineSentence(docLists,docLabels)
model = Doc2Vec(min_count=1, window=10, size=2000, sample=1e-4, negative=5, workers=8)

model.build_vocab(sentences.to_array())
token_count = sum([len(sentence) for sentence in sentences])
for epoch in range(10):
    model.train(sentences.sentences_perm(),total_examples = token_count,epochs=model.iter)
    model.alpha -= 0.002 # decrease the learning rate
    model.min_alpha = model.alpha # fix the learning rate, no decay
    model.train(sentences.sentences_perm(),total_examples = token_count,epochs=model.iter)

tfidfLists = util.findtfIDFLists(docLists)
topTFIDFWordLists = util.findTopNtfidfterms(docLists,tfidfLists,N)

#model.most_similar('arch')
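
The commented-out query above hints at similarity lookups. With the older gensim API used here, per-document vectors are available through model.docvecs and can be compared with the cosine_similarity helper defined at the top of this example. A minimal sketch, assuming training finished, that the entries of docLabels were used as document tags, and that at least two labels were collected:

vecA = model.docvecs[docLabels[0]]
vecB = model.docvecs[docLabels[1]]
print(cosine_similarity(vecA, vecB))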