# Example #1
0
def makeVectorsFromFiles(pathname, outdir, **params):
    """Vectorize every file in a directory, writing one output file per document.

    Each entry returned by ``os.listdir(pathname)`` is read in full; non-empty
    contents are registered with ``Vectors.addText`` under the entry's file
    name.  Then, for every key of ``v.docs``, the value returned by
    ``v.vectorString`` is written (followed by a newline) to a file of the
    same name inside ``outdir``.  Finally ``v.saveFeatures()`` is called.

    Args:
        pathname: Directory containing the input documents.  A trailing
            path separator is optional.
        outdir: Directory that receives the output files; it is created
            when it does not exist.  A trailing separator is optional.
        **params: Forwarded unchanged to the ``Vectors`` constructor.
    """
    v = Vectors(**params)
    for docName in os.listdir(pathname):
        # os.path.join works whether or not the caller supplied a trailing
        # separator; plain concatenation silently built a wrong path.
        with open(os.path.join(pathname, docName), 'r') as docFile:
            docText = docFile.read()
        if docText:
            v.addText(docText, docName)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # NOTE(review): assumes the keys of v.docs are the plain file names
    # passed to addText above -- confirm against the Vectors class.
    for docName in v.docs:
        vectorText = v.vectorString(v.docs[docName])
        with open(os.path.join(outdir, docName), 'w') as outfile:
            # Same output as the former `print >>outfile, ...`: the value
            # followed by a single newline.
            outfile.write("%s\n" % (vectorText,))

    v.saveFeatures()
import sys
from text2vec import Vectors

# USAGE:
# python prepareSVMFeatures.py `ls -d op_spam_v1.3/all/*`

# Build one vector per file named on the command line, then print each
# vector on stdout prefixed with a "+1" or "-1" label.
userInput = sys.argv[1:]
v = Vectors()
for docName in userInput:
    # Only the first line of each file is used as the document text.
    # readline() returns '' for an empty file, which is skipped.
    with open(docName) as docFile:
        docText = docFile.readline()
    if docText:
        v.addDoc(docName, docText)
v.saveFeatures()

for docName in v.docs:
    s = v.vectorString(v.docs[docName])
    # The label is picked from a marker substring of the vector string.
    # NOTE(review): presumably the string embeds the document path, so that
    # "/d_" and "/t_" identify the two classes -- confirm in text2vec.
    if "/d_" in s: # GENERALIZE THIS TO A REGEX?
        print("+1 " + s)
    if "/t_" in s:
        print("-1 " + s)