def makeVectorsFromFiles(pathname, outdir, **params):
    """Vectorize every file in a directory and write one vector file per document.

    Each entry of ``os.listdir(pathname)`` is read in full; non-empty texts
    are added to a ``Vectors`` instance via ``addText(text, name)``.
    Afterwards ``outdir`` is created if it does not exist, the
    ``vectorString`` of every document in ``v.docs`` is written (followed by
    a newline) to a file of the same name inside ``outdir``, and
    ``saveFeatures()`` is called on the ``Vectors`` instance.

    Args:
        pathname: Directory containing the input documents.
        outdir: Directory that receives one output file per document.
        **params: Forwarded unchanged to the ``Vectors`` constructor.
    """
    v = Vectors(**params)

    for docName in os.listdir(pathname):
        # os.path.join yields the same path as plain concatenation when
        # pathname ends with a separator, and a correct one when it does not.
        with open(os.path.join(pathname, docName), 'r') as docFile:
            docText = docFile.read()
        # Empty files contribute no document.
        if docText:
            v.addText(docText, docName)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    for docName in v.docs:
        with open(os.path.join(outdir, docName), 'w') as outfile:
            # The one-element tuple keeps %-formatting safe even if
            # vectorString ever returns a tuple.
            outfile.write("%s\n" % (v.vectorString(v.docs[docName]),))

    v.saveFeatures()
import sys

from text2vec import Vectors

# USAGE:
#   python prepareSVMFeatures.py `ls -d op_spam_v1.3/all/*`
#
# Builds a Vectors instance from the files named on the command line, saves
# its features, then prints one labelled line per document to stdout:
# "+1 <vector>" when the vector string contains "/d_" and "-1 <vector>" when
# it contains "/t_".
# NOTE(review): the label test runs on the vector string, so it presumably
# relies on vectorString() embedding the document path -- confirm.

userInput = sys.argv[1:]

v = Vectors()
for docName in userInput:
    # Only the first line of each file is used as the document text; the
    # with-block closes the handle even if reading fails.
    with open(docName) as docFile:
        docText = docFile.readline()
    # readline() returns '' only for an empty file; such files are skipped.
    if docText:
        v.addDoc(docName, docText)
v.saveFeatures()

for docName in v.docs:
    s = v.vectorString(v.docs[docName])
    # TODO: generalize these substring checks to a regex?
    # The two tests are independent: a string containing both markers is
    # printed once with each label.
    if "/d_" in s:
        print("+1 " + s)
    if "/t_" in s:
        print("-1 " + s)