import collections
from collections import Counter

from gensim.models import Doc2Vec

# Module-level names used below (objInstances, meaningfulWords, fld, posName,
# negName, rgbWords, shapeWords, objWords, N, util) are assumed to be defined
# elsewhere in this repo.


def writePosNeg():
    testObjs = objInstances.keys()
    descObjs = util.getDocsForTest(testObjs)
    # descObjs = util.getDocuments()
    objTokens = util.sentenceToWordDicts(descObjs)
    tknsGlobal = set()
    posTokens = {}
    negSampleTokens = {}
    mostImpTokens = {}
    if len(objTokens.keys()) == 0:
        return 1
    # A token is a positive example for every object whose description uses it
    # more than 10 times; only tokens in meaningfulWords are kept.
    for (key, value) in objTokens.items():
        cValue = Counter(value)
        mostImpTokens[key] = []
        for (k1, v1) in cValue.items():
            if v1 > 10:
                mostImpTokens[key].append(k1)
                if k1 in meaningfulWords:
                    tknsGlobal.add(k1)
                    if k1 in posTokens:
                        posTokens[k1].append(key)
                    else:
                        posTokens[k1] = [key]
    posTokens = collections.OrderedDict(sorted(posTokens.items()))
    f = open(fld + "/" + posName, "w")
    title = "token,objects\n"
    f.write(title)
    for k, v in posTokens.items():
        f.write(str(k) + "," + "-".join(v) + "\n")
    f.close()
    # Every object whose description never uses a token is a negative-sample
    # candidate for that token.
    for kTkn in posTokens.keys():
        negSampleTokens[kTkn] = []
        for (key, value) in objTokens.items():
            if kTkn not in value:
                negSampleTokens[kTkn].append(key)
    # Restrict the candidates to objects that util.doc2Vec also returns for
    # every positive object of the token.
    negTokens = {}
    negsD = util.doc2Vec(descObjs)
    for kTkn in posTokens.keys():
        negTokens[kTkn] = negSampleTokens[kTkn]
        posV = posTokens[kTkn]
        for v in posV:
            negDocVec = negsD[v]
            negTokens[kTkn] = list(set(negTokens[kTkn]).intersection(set(negDocVec)))
    negTokens = collections.OrderedDict(sorted(negTokens.items()))
    f = open(fld + "/" + negName, "w")
    f.write(title)
    for k, v in negTokens.items():
        f.write(str(k) + "," + "-".join(v) + "\n")
    f.close()
    # Also write per-category files restricted to color, shape, and object words.
    kWord = ["rgb", "shape", "object"]
    for wd in kWord:
        f = open(fld + "/" + wd + posName, "w")
        f1 = open(fld + "/" + wd + negName, "w")
        sWords = []
        f.write(title)
        f1.write(title)
        if wd == "rgb":
            sWords = rgbWords
        elif wd == "shape":
            sWords = shapeWords
        elif wd == "object":
            sWords = objWords
        for k, v in posTokens.items():
            if k in sWords:
                if len(v) > 0:
                    f.write(str(k) + "," + "-".join(v) + "\n")
                v = negTokens[k]
                if len(v) > 0:
                    f1.write(str(k) + "," + "-".join(v) + "\n")
        f.close()
        f1.close()
    return 0
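# A toy illustration of the positive/negative split performed above. The
# object and token names here are made up for the example, not taken from
# real data: a token is "positive" for every object whose description uses
# it, and a negative-sample candidate for every remaining object.
def _demoPosNeg():
    toyTokens = {"apple/1": ["red", "round"], "lemon/2": ["yellow"]}
    pos, neg = {}, {}
    for obj, toks in toyTokens.items():
        for t in toks:
            pos.setdefault(t, []).append(obj)
    for t in pos:
        neg[t] = [o for o in toyTokens if t not in toyTokens[o]]
    # pos == {'red': ['apple/1'], 'round': ['apple/1'], 'yellow': ['lemon/2']}
    # neg == {'red': ['lemon/2'], 'round': ['lemon/2'], 'yellow': ['apple/1']}
    return pos, neg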
def writePosNeg():
    # Variant of the writer above, keyed the other way around: it maps each
    # object to its frequent meaningful tokens rather than each token to its
    # objects. Since it shares the name, this second definition replaces the
    # first when both are kept in one module.
    testObjs = objInstances.keys()
    descObjs = util.getDocsForTest(testObjs)
    # descObjs = util.getDocuments()
    objTokens = util.sentenceToWordDicts(descObjs)
    tknsGlobal = set()
    posTokens = {}
    negSampleTokens = {}
    mostImpTokens = {}
    for (key, value) in objTokens.items():
        cValue = Counter(value)
        mostImpTokens[key] = []
        for (k1, v1) in cValue.items():
            if v1 > 10:
                mostImpTokens[key].append(k1)
                if k1 in meaningfulWords:
                    tknsGlobal.add(k1)
                    if key in posTokens:
                        posTokens[key].append(k1)
                    else:
                        posTokens[key] = [k1]
    posTokens = collections.OrderedDict(sorted(posTokens.items()))
    f = open(fld + "/" + posName, "w")
    title = "object,tokens\n"
    f.write(title)
    for k, v in posTokens.items():
        f.write(str(k) + "," + "-".join(v) + "\n")
    f.close()
    # Per-category files restricted to color, shape, and object words.
    kWord = ["rgb", "shape", "object"]
    for wd in kWord:
        f = open(fld + "/" + wd + posName, "w")
        sWords = []
        f.write(title)
        if wd == "rgb":
            sWords = rgbWords
        elif wd == "shape":
            sWords = shapeWords
        elif wd == "object":
            sWords = objWords
        for k, v in posTokens.items():
            vv = [v1 for v1 in v if v1 in sWords]
            if len(vv) > 0:
                f.write(str(k) + "," + "-".join(vv) + "\n")
        f.close()
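# Two helpers referenced below are not defined in this section. The sketches
# here are assumptions, not the repo's actual code: `square_rooted` follows
# the usual Euclidean-norm helper (its 3-decimal rounding mirrors the rounding
# in the cosine routine below), and `LabeledLineSentence` follows the common
# gensim Doc2Vec tutorial pattern of wrapping token lists in TaggedDocument
# objects with `to_array`/`sentences_perm` methods.
import random
from math import sqrt

from gensim.models.doc2vec import TaggedDocument


def square_rooted(x):
    # Euclidean norm of a vector, rounded to 3 decimals.
    return round(sqrt(sum(a * a for a in x)), 3)


class LabeledLineSentence(object):
    def __init__(self, doc_lists, doc_labels):
        self.doc_lists = doc_lists
        self.doc_labels = doc_labels

    def __iter__(self):
        for words, label in zip(self.doc_lists, self.doc_labels):
            yield TaggedDocument(words=words, tags=[label])

    def to_array(self):
        # Materialize once so build_vocab and sentences_perm share one list.
        self.sentences = [TaggedDocument(words=w, tags=[l])
                          for w, l in zip(self.doc_lists, self.doc_labels)]
        return self.sentences

    def sentences_perm(self):
        # Reshuffle between epochs so training order differs each pass.
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return shuffled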
def cosine_similarity(x, y):
    # The def line was lost in the source; the body below is the standard
    # cosine similarity between two vectors, so that signature is assumed.
    numerator = sum(a * b for a, b in zip(x, y))
    denominator = square_rooted(x) * square_rooted(y)
    return round(numerator / float(denominator), 3)


oNames = util.objectNames()
objNames = collections.OrderedDict(sorted(oNames.items()))
docs = util.getDocuments()
docLabels = []
docNames = docs.keys()
for key in docs.keys():
    ar = key.split("/")
    docLabels.append(ar[1])
docLists = util.sentenceToWordLists(docs)
docDicts = util.sentenceToWordDicts(docs)
sentences = LabeledLineSentence(docLists, docLabels)

model = Doc2Vec(min_count=1, window=10, size=2000, sample=1e-4, negative=5, workers=8)
model.build_vocab(sentences.to_array())
token_count = sum([len(sentence) for sentence in sentences])
for epoch in range(10):
    model.train(sentences.sentences_perm(), total_examples=token_count, epochs=model.iter)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    model.train(sentences.sentences_perm(), total_examples=token_count, epochs=model.iter)

tfidfLists = util.findtfIDFLists(docLists)
topTFIDFWordLists = util.findTopNtfidfterms(docLists, tfidfLists, N)
# model.most_similar('arch')
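# Example queries against the trained model; a sketch assuming the pre-4.0
# gensim API implied by the size=/iter= parameters above. 'arch' stands in
# for any token that actually occurs in the corpus vocabulary.
similarWords = model.most_similar('arch')            # tokens near 'arch'
docVec = model.docvecs[docLabels[0]]                 # learned vector for one document
similarDocs = model.docvecs.most_similar(docLabels[0])  # nearest documents by label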