Пример #1
0
def getSentsByOntology():
    """Collect job sentences that mention at least three ontology terms.

    Builds a list of lower-cased, space-padded terms from the labels and
    class names of the ``web_dev.owl`` ontology, then scans every sentence
    of every job in the ``daily_job_webdev`` collection.  A sentence is
    recorded, together with the job ``_id``, as soon as three distinct terms
    have been found in it.  Each recorded sentence is echoed to stdout, and
    the full list is handed to ``dumpTwo`` under the name ``"term3"``,
    ordered by sentence length (shortest first).

    Note that the recorded sentence is the lower-cased, space-padded form,
    not the original text.
    """
    # Raw string: the backslashes are path separators, not escape sequences.
    owlfile = r"..\..\jobaly\ontology\web_dev.owl"
    ontology = OntologyLib(owlfile)

    # Terms are padded with spaces so they only match when delimited by
    # spaces in the (equally padded) sentence.  Duplicates are dropped so a
    # name that is both a label and a class name is not counted twice.
    terms = []
    seen = set()
    for names in (ontology.getLabelList(), ontology.getAllClassNames()):
        for name in names:
            term = " " + name.lower() + " "
            if term not in seen:
                seen.add(term)
                terms.append(term)

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")

    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            padded = " " + sent.lower() + " "
            hits = 0
            for term in terms:
                if term in padded:
                    hits += 1
                    # Three hits are enough; stop scanning this sentence.
                    if hits == 3:
                        # Single-argument parenthesised print behaves the
                        # same under the Python 2 print statement.
                        print(padded.encode("GBK", "ignore"))
                        matchingSents.append((jid, padded))
                        break

    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, "term3", (lambda x: x[0] + ":" + x[1]))
Пример #2
0
def preProcess(data_set_name, target_set_name):
    """Filter, pre-process and re-dump the items of a JSON data set.

    Items whose element 1 has a length of 200 or more are skipped.  For each
    remaining item the result of ``preProcessFun`` on element 1 is appended,
    and element 1 is then overwritten with the number of whitespace-separated
    words in element 2.  The kept items are sorted by that count and written
    with ``datautils.dumpTwo`` using ``dumpLam2``.

    The loaded items are modified in place.
    """
    max_length = 200
    kept = []
    for record in datautils.loadJson(data_set_name):
        # Guard clause: skip anything at or above the length limit.
        if len(record[1]) >= max_length:
            continue
        record.append(preProcessFun(record[1]))
        # presumably index 2 is the value just appended (two-element items)
        # -- TODO confirm against the data set layout
        record[1] = len(record[2].split())
        kept.append(record)

    kept.sort(key=operator.itemgetter(1))
    datautils.dumpTwo(kept, target_set_name, dumpLam2)
Пример #3
0
def preProcess(data_set_name, target_set_name):
    """Pre-process the short items of a data set and dump them by word count.

    Loads ``data_set_name`` through ``datautils.loadJson``.  Every item whose
    element 1 is shorter than 200 gets the output of ``preProcessFun``
    appended, and its element 1 replaced by the word count of element 2.
    The selected items (mutated in place) are ordered by that word count and
    passed to ``datautils.dumpTwo`` together with ``dumpLam2``.
    """
    length_limit = 200
    loaded = datautils.loadJson(data_set_name)

    selected = []
    for entry in loaded:
        raw_text = entry[1]
        if len(raw_text) < length_limit:
            processed = preProcessFun(raw_text)
            entry.append(processed)
            # assumes items arrive with two elements, so index 2 is the
            # text appended above -- verify against the caller's data
            words = entry[2].split()
            entry[1] = len(words)
            selected.append(entry)

    by_word_count = operator.itemgetter(1)
    ordered = sorted(selected, key=by_word_count)
    datautils.dumpTwo(ordered, target_set_name, dumpLam2)
Пример #4
0
def getSentenceByTerm(collection, term, outputPath):
    """Collect every job sentence containing ``term`` as a token.

    Args:
        collection: object whose ``find()`` yields job documents; each job
            must provide an ``"_id"`` entry and be accepted by
            ``JobDescParser.parseJobDesc``.
        term: the word to look for.  Matching is case-insensitive.
        outputPath: target name passed to ``dumpTwo``.

    Each matching sentence is echoed to stdout.  All ``(job id, sentence)``
    pairs are then handed to ``dumpTwo``, ordered by sentence length
    (shortest first) and formatted as ``id:sentence``.
    """
    # Tokens are lower-cased before comparison, so the term must be too;
    # otherwise a mixed-case term such as "JavaScript" could never match.
    wanted = term.lower()

    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            if any(token.lower() == wanted for token in word_tokenize(sent)):
                matchingSents.append((jid, sent))
                # Single-argument parenthesised print behaves the same
                # under the Python 2 print statement.
                print(sent.encode("GBK", "ignore"))

    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, outputPath, (lambda x: x[0] + ":" + x[1]))
Пример #5
0
def getJavaScipt():
    """Collect every job sentence that contains the token ``javascript``.

    Reads all jobs from the ``daily_job_webdev`` collection of the
    ``jobaly_daily_test`` database on localhost, tokenizes each sentence with
    ``word_tokenize`` and keeps those with a token equal to ``"javascript"``
    once lower-cased.  Each match is echoed to stdout.  The
    ``(job id, sentence)`` pairs are then handed to ``dumpTwo``, ordered by
    sentence length (shortest first) and formatted as ``id:sentence``.
    """
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")

    term = "javascript"
    # Raw string: the backslashes are path separators, not escape sequences.
    outputPath = r"..\skill\output\javascript"

    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            if any(token.lower() == term for token in word_tokenize(sent)):
                matchingSents.append((jid, sent))
                # Single-argument parenthesised print behaves the same
                # under the Python 2 print statement.
                print(sent.encode("GBK", "ignore"))

    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, outputPath, (lambda x: x[0] + ":" + x[1]))