Example #1
# Assumes the jobaly project helpers OntologyLib, DbClient, JobDescParser
# and dumpTwo are importable in this module.
def getSentsByOntology():
    # Load the web-development ontology and build a lower-cased term list,
    # padded with spaces so that substring matching only hits whole words.
    owlfile = r"..\..\jobaly\ontology\web_dev.owl"  # raw string keeps the backslashes literal
    ontology = OntologyLib(owlfile)
    terms = [" " + x.lower() + " " for x in ontology.getLabelList()]
    terms.extend([" " + x.lower() + " " for x in ontology.getAllClassNames()])

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")

    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            c = 0
            sent = " " + sent.lower() + " "
            for term in terms:
                if sent.find(term) != -1:
                    c += 1
                if c == 3:  # keep sentences that mention at least three ontology terms
                    print sent.encode("GBK", "ignore")
                    matchingSents.append((jid, sent))
                    break

    # Shortest matches first, then dump "<job id>:<sentence>" lines.
    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, "term3", lambda x: x[0] + ":" + x[1])
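dumpTwo is a jobaly helper whose definition is not shown in these snippets; from its call sites it evidently writes one formatted line per (job id, sentence) pair to an output file. A minimal sketch of such a helper, assuming the signature used above (the file handling and encoding are guesses, not the project's actual code):

def dumpTwo(pairs, outputPath, formatLine):
    # Hypothetical stand-in: write one formatted line per pair,
    # GBK-encoded like the prints in the surrounding code.
    out = open(outputPath, 'w')
    for pair in pairs:
        out.write(formatLine(pair).encode("GBK", "ignore") + "\n")
    out.close()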
Example #2
import json
from nltk.tokenize import word_tokenize

def getDisMatrixFromColletion():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")
    f = open('sents.txt', 'w')  # Python converts \n to os.linesep on write

    # Build one tokenized document per job posting, logging every sentence.
    docs = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        doc = []
        for sent in sents:
            f.write(sent.encode("GBK", "ignore") + "\n")
            tokens = [token.lower() for token in word_tokenize(sent)]
            doc.extend(tokens)
        docs.append(doc)
    f.close()

    terms = ["javascript", "jquery", "html", "css", "java",
             "python", "ruby", "mysql", "jdbc", "cpp"]
    matrix = getDistanceMatrix(docs, terms)
    printDisMatrix(terms, matrix)
    matrix_dump = json.dumps(matrix)
    print matrix_dump
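getDistanceMatrix and printDisMatrix are also project helpers that are not shown here. Given the inputs (tokenized documents plus a term list) and the fact that the result survives json.dumps, one plausible reading is a symmetric matrix of term-to-term distances. A sketch under that assumption, using one minus the Jaccard similarity of the sets of documents in which two terms occur (not the project's actual implementation):

def getDistanceMatrix(docs, terms):
    # For each term, the set of document indices containing it.
    docSets = dict((t, set(i for i, doc in enumerate(docs) if t in doc))
                   for t in terms)
    matrix = []
    for a in terms:
        row = []
        for b in terms:
            union = docSets[a] | docSets[b]
            inter = docSets[a] & docSets[b]
            if union:
                row.append(1.0 - float(len(inter)) / len(union))
            else:
                row.append(0.0)  # neither term occurs in any document
        matrix.append(row)
    return matrix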
Example #3
def getAllSentsInColl(collection):
    # Collect (job id, sentence) pairs across every job in the collection.
    allSents = []
    for job in collection.find():
        print "\n\n\n======", job["_id"], "============================\n"
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = [(jobDesc._id, sent) for sent in jobDesc.listAllSentences()]
        allSents.extend(sents)
    return allSents
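A minimal driver for getAllSentsInColl, assuming the DbClient settings used in the other snippets:

client = DbClient('localhost', 27017, "jobaly_daily_test")
coll = client.getCollection("daily_job_webdev")
allSents = getAllSentsInColl(coll)
print len(allSents)  # total number of (job id, sentence) pairs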
Example #4
def preprocess(job):
    # Parse one job document and run processLine over each of its sentences.
    jobDesc = JobDescParser.parseJobDesc(job)
    sents = jobDesc.listAllSentences()
    return [processLine(line) for line in sents]
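processLine is defined elsewhere in the project. A hypothetical stand-in that lower-cases a sentence and collapses whitespace, just to make the example self-contained (the real implementation may do more, such as tokenization or stop-word removal):

import re

def processLine(line):
    # Hypothetical stand-in, not the project's actual processLine.
    return re.sub(r"\s+", " ", line.lower()).strip()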
Example #5
def getSentenceByTerm(collection, term, outputPath):
    # Collect every sentence whose token list contains the given term.
    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            tokens = [token.lower() for token in word_tokenize(sent)]
            if term in tokens:
                matchingSents.append((jid, sent))
                print sent.encode("GBK", "ignore")

    # Shortest matches first, then dump "<job id>:<sentence>" lines.
    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, outputPath, lambda x: x[0] + ":" + x[1])
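An example call, again assuming the collection setup used throughout these snippets; the term and output path are arbitrary:

client = DbClient('localhost', 27017, "jobaly_daily_test")
coll = client.getCollection("daily_job_webdev")
getSentenceByTerm(coll, "python", r"..\skill\output\python")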
Example #6
def getJavaScipt():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")

    term = "javascript"
    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            tokens = [token.lower() for token in word_tokenize(sent)]
            if term in tokens:
                matchingSents.append((jid, sent))
                print sent.encode("GBK", "ignore")

    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    # Raw string for the Windows path; the original relied on \s, \o and \j
    # not being escape sequences.
    dumpTwo(sortedsents, r"..\skill\output\javascript", lambda x: x[0] + ":" + x[1])
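Note that this function is simply Example #5's getSentenceByTerm with the collection, term, and output path hard-coded; the equivalent call would be:

getSentenceByTerm(collection, "javascript", r"..\skill\output\javascript")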
Example #7
def createDocs():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")
    maxnum = 99999  # safety cap on the number of jobs processed
    docs = []
    i = 0
    for job in collection.find():
        i += 1
        if i == maxnum:
            break
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()

        doc = []
        for sent in sents:
            tokens = [token.lower() for token in word_tokenize(sent)]
            doc.extend(tokens)
        docs.append(doc)

    return docs
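createDocs builds the same tokenized-documents structure that getDisMatrixFromColletion (Example #2) consumes, so the two compose naturally:

docs = createDocs()
terms = ["javascript", "jquery", "html", "css"]
matrix = getDistanceMatrix(docs, terms)
printDisMatrix(terms, matrix)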