Example #1
def mergeDailyJob(date):
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily")
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    listCol = srcBbClient.getCollection("daily_job_list_" + date)
    infoCol = srcBbClient.getCollection("daily_job_info_" + date)
    newCol = targetBbClient.getCollection("daily_job_" + date)
    mergeJob(listCol, infoCol, newCol)
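
The DbClient helper used throughout these examples is not shown. Below is a minimal sketch of what it might look like, assuming it is a thin wrapper over pymongo's MongoClient; the method names are inferred from the call sites in these examples, and the real implementation may differ.

# Minimal sketch of the DbClient wrapper (an assumption, inferred from usage).
from pymongo import MongoClient

class DbClient(object):

    def __init__(self, host, port, dbName):
        self.client = MongoClient(host, port)
        self.db = self.client[dbName]

    def getCollection(self, collName):
        return self.db[collName]

    def getCollectionSize(self, collName):
        # count() is the pymongo 2.x-era call; newer drivers use
        # count_documents({}) instead.
        return self.db[collName].count()

    @staticmethod
    def findById(collection, jid):
        return collection.find_one({"_id": jid})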
Example #2
def filterWebDeveloper_indeed():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    srcCol = srcBbClient.getCollection("daily_job_2014-06-05")
    newCol = targetBbClient.getCollection("daily_job_webdev")

    # Copy jobs whose title contains both "web" and "developer".
    for job in srcCol.find():
        jobtitle = job["jobtitle"].lower()
        if "web" in jobtitle and "developer" in jobtitle:
            print jobtitle.encode("GBK", "ignore")
            newCol.insert(job)
Example #3
def testProcessPage():

    listCollectionName = "daily_dice_list_2014-07-11"
    infoCollectionName = "daily_dice_info_2014-07-11"

    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    listCollection = dbClient.getCollection(listCollectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)
    getter = DicePageGetter(infoCollection)

    pageSize = 100
    pageNo = 1
    has_more = True
    pageNum = 10000  # upper bound on the number of pages to process
    find_sort = None
    find_spec = None
    while has_more and pageNo <= pageNum:
        page = dbClient.getPage(listCollection, find_spec, find_sort, pageSize,
                                pageNo)
        getter.processPage(page, pageNo)
        pageNo += 1
        # A page shorter than pageSize means this was the last page.
        count = page.count(with_limit_and_skip=True)
        if count < pageSize:
            has_more = False
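
DbClient.getPage is not shown either; given the loop above, it presumably pages through the collection with skip/limit, so that count(with_limit_and_skip=True) drops below pageSize on the final, partial page. A hypothetical sketch matching the call signature (an assumption, not the repository's actual code):

# Hypothetical DbClient.getPage; pages are 1-indexed and the returned
# cursor is capped at pageSize documents.
def getPage(self, collection, find_spec, find_sort, pageSize, pageNo):
    cursor = collection.find(find_spec)
    if find_sort is not None:
        cursor = cursor.sort(find_sort)
    return cursor.skip((pageNo - 1) * pageSize).limit(pageSize)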
Example #4
def main():
    days = 4
    today = datetime.date.today()
    # The hard-coded date overrides today's date (e.g. for a backfill run).
    today = "2014-06-15_p4"
    listCollectionName = "daily_job_list_" + str(today)
    print "list collection name:", listCollectionName
    infoCollectionName = "daily_job_info_" + str(today)
    print "info collection name:", infoCollectionName

    #   lang_names = jobaly.utils.loadArrayFromFile("test_lang_list.txt")
    #   cities = jobaly.utils.loadArrayFromFile("test_loc_list.txt")
    lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    cities = jobaly.utils.loadArrayFromFile("loc_list.txt")

    dbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    listCollection = dbClient.getCollection(listCollectionName)

    start_time = time.time()
    print "---- start get job list ----"
    #  getJobList(listCollectionName)
    crawlIndeed(listCollection, lang_names, cities, days)
    t = time.time() - start_time
    print "---- finish get job list, use %s seconds  ----" % t

    print
    print

    infoCollection = dbClient.getCollection(infoCollectionName)
    start_time = time.time()
    print "---- start get job info ----"
    getJobInfo(dbClient, listCollection, infoCollection)
    t = time.time() - start_time
    print "---- finish get job info, use %s seconds  ----" % t
Example #5
def getSentsByOntology():
    owlfile = r"..\..\jobaly\ontology\web_dev.owl"
    ontology = OntologyLib(owlfile)
    # Pad each term with spaces so only whole words are matched.
    terms = [" " + x.lower() + " " for x in ontology.getLabelList()]
    terms.extend([" " + x.lower() + " " for x in ontology.getAllClassNames()])

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")

    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            c = 0
            sent = " " + sent.lower() + " "
            # Keep a sentence once it matches at least three ontology terms.
            for term in terms:
                if sent.find(term) != -1:
                    c += 1
                if c == 3:
                    print sent.encode("GBK", "ignore")
                    matchingSents.append((jid, sent))
                    break

    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, "term3", (lambda x: x[0] + ":" + x[1]))
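
The dumpTwo helper is not shown; from its uses in Examples #5, #18, and #28 it takes a list of items, an output name, and a formatting function. A plausible sketch, assuming it writes one formatted line per item:

# Plausible dumpTwo helper (an assumption, inferred from call sites).
import codecs

def dumpTwo(items, outName, formatFunc):
    with codecs.open(outName + ".txt", "w", encoding="utf-8") as f:
        for item in items:
            f.write(formatFunc(item) + "\n")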
Example #6
def aggregateHtmlTag():
    listCollectionName = "daily_dice_info_2014-07-11"
    # listCollectionName = "daily_job_info_2014-07-08"
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    dataProcessor = JobDataProcessor(collection)
    dataProcessor.aggregateHtmlTags()
Example #7
def testGetSentenceByTerm(term):
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcBbClient.getCollection("daily_job_webdev")

    outputPath = "..\\skill\\output\\" + term
    getSentenceByTerm(collection, term, outputPath)
Example #8
def main():
    path = "..\\..\\..\\..\\data\\resumes\\web\\"
    # scandir(path)

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    resumeCollName = "web_resumes"
    collection = srcBbClient.getCollection(resumeCollName)
    saveResumes(path, collection)
Example #9
def aggregateTitle():
    # The second assignment takes effect; the first is an unused alternative.
    listCollectionName = "daily_job_list_2014-06-10"
    listCollectionName = "daily_dice_info_2014-07-11"
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    dataProcessor = JobDataProcessor(collection)
    dataProcessor.aggregateTitleToFile("titles/dice_titleList.json")
    dataProcessor.aggregateTitleToFile("titles/dice_titleList.txt", "text")
Example #10
def connectJobColl(self, dbName, collName):
    self.dbname = dbName
    self.collname = collName
    self.dbClient = DbClient('localhost', 27017, dbName)
    self.jobCollection = self.dbClient.getCollection(collName)
    self.collSize = self.dbClient.getCollectionSize(collName)
    self.modelCollection = self.dbClient.getCollection(collName + "_model")
Example #11
def filterWebDeveloper_dice():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily")
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    srcCol = srcBbClient.getCollection("daily_dice_info_2014-07-11")
    newCol = targetBbClient.getCollection("daily_job_webdev")

    i = 0
    for job in srcCol.find():
        jobtitle = job["jobtitle"].lower()
        if "web" in jobtitle and "developer" in jobtitle:
            # Rename the dice detailUrl field to url before inserting.
            job["url"] = job["detailUrl"]
            job["detailUrl"] = None
            newCol.insert(job)
            i += 1
            print i, ":", jobtitle.encode("GBK", "ignore")
            if i == 150:
                break
Example #12
def main(): 
    
    dbClient = DbClient('localhost', 27017, "jobaly")  
    jobCollection = dbClient.getCollection(gConfig["webJobInfoCollName"])  
    tfIdfMatch = TfIdfMatch(jobCollection)
    resume = "I a am good java programmer, PHP, XML, hope juse c++, skill" 
    jobs = tfIdfMatch.matchResume(resume)
    
    for job in jobs:
        print job["_id"], job["score"]
Example #13
def processJobColl():
    # Later assignments override the earlier ones: the effective source is
    # the daily_job_info_2014-06-16 collection in the jobaly_daily_test db.
    srcDb = "jobaly_daily"
    srcCollnames = "daily_job_info_2014-06-16"
    srcDb = "jobaly_daily_test"
    srcCollnames = "daily_job_webdev"
    srcClient = DbClient('localhost', 27017, srcDb)
    srcCollnames = "daily_job_info_2014-06-16"
    srcColl = srcClient.getCollection(srcCollnames)

    targetDb = "jobaly"
    targetCollName = "job100"
    targetClient = DbClient('localhost', 27017, targetDb)
    targetColl = targetClient.getCollection(targetCollName)

    size = 15
    #  copyColl(srcColl, targetColl, size)
    processjobs(targetDb, targetCollName)
Example #14
def setup_tfidfMatcher(self):
    # Fall back to a local connection when no client was injected.
    if self.dbclient is None:
        self.dbClient = DbClient('localhost', 27017, "jobaly")
    else:
        self.dbClient = self.dbclient

    self.resumeCollection = self.dbClient.getCollection(gConfig["webResumeColName"])
    self.jobCollection = self.dbClient.getCollection(gConfig["webJobInfoCollName"])
    self.jobModelCollection = self.dbClient.getCollection(gConfig["jobModelCollName"])
    self.matcher = TfIdfMatch(self.jobCollection)
Example #15
def main():

    # print gConfig
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection(gConfig["webJobInfoCollName"])
    jobIdfCollection = dbClient.getCollection(gConfig["JobIdfCollName"])

    tfIdfGetter = TfIdfGetter()
    # tfIdfGetter.saveJobTfIdf(jobCollection,  jobIdfCollection )
    idf, jobs = tfIdfGetter.getJobTfIdf(jobCollection)
    print idf
Example #16
def testProcessQuery():
    today = datetime.date.today()
    listCollectionName = "daily_dice_list_" + str(today)
    # Override with a fixed test collection name.
    listCollectionName = "daily_dice_list_" + "test"

    print listCollectionName
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    diceClient = DiceApiClient()
    diceClient.setState("TN")
    print diceClient.processQuery(collection)
Example #17
def testParseAll():

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")

    # Parse the description of every job in the collection.
    for job in newCol.find():
        print "\n\n\n======", job["_id"], "============================\n"

        jobDesc = JobDescParser.parseJobDesc(job)
Example #18
def testTermMatching():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    allSents = getAllSentsInColl(newCol)

    term = "experience"
    term = "knowledge"
    term = "skills"
    term = "degree"
    matchingSents = termMatching(allSents, term)
    dumpTwo(matchingSents, "sents\\matching_" + term,
            (lambda x: x[0] + ":" + x[1]))
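
Neither getAllSentsInColl nor termMatching/termsMatching appears in these examples. A sketch of what they might do over (job id, sentence) pairs, reusing the whole-word padding trick from Example #5; the names and semantics here are assumptions:

# Assumed matching helpers; allSents is taken to be a list of
# (job_id, sentence) pairs.
def termMatching(allSents, term):
    padded = " " + term.lower() + " "
    return [(jid, sent) for jid, sent in allSents
            if (" " + sent.lower() + " ").find(padded) != -1]

def termsMatching(allSents, terms):
    matches = []
    for jid, sent in allSents:
        lowered = " " + sent.lower() + " "
        # Keep a sentence as soon as any term matches.
        for term in terms:
            if lowered.find(" " + term.lower() + " ") != -1:
                matches.append((jid, sent))
                break
    return matches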
Example #19
def main():
    targetBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    targetColl = targetBbClient.getCollection("test_coll")

    srcBbClient = DbClient('localhost', 27017, "jobaly_daily")
    srcColl = srcBbClient.getCollection("daily_job_info_2014-06-04")
    #   collutils.copyCollection(srcColl, targetColl)

    srcCollNames = [
        "daily_job_info_2014-06-04", "daily_job_info_2014-06-05",
        "daily_job_info_2014-06-06", "daily_job_info_2014-06-08",
        "daily_job_info_2014-06-10"
    ]
    #  collutils.copyCollections(targetBbClient, "job_info_merge", srcBbClient, srcCollNames)

    srcCollNames = [
        "daily_job_list_2014-06-04", "daily_job_list_2014-06-05",
        "daily_job_list_2014-06-06", "daily_job_list_2014-06-08",
        "daily_job_list_2014-06-10"
    ]
    collutils.copyCollections(targetBbClient, "job_list_merge", srcBbClient,
                              srcCollNames)
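
collutils.copyCollections is also not shown; from the call above it appears to merge several source collections into one target collection. A rough sketch under that assumption:

# Assumed semantics of collutils.copyCollections: copy every document
# from each named source collection into a single target collection.
def copyCollections(targetClient, targetCollName, srcClient, srcCollNames):
    targetColl = targetClient.getCollection(targetCollName)
    for name in srcCollNames:
        srcColl = srcClient.getCollection(name)
        for doc in srcColl.find():
            targetColl.insert(doc)  # pymongo 2.x-era insert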
Example #20
def loadJobs(dbname, collName, ids):
    dbClient = DbClient('localhost', 27017, dbname)
    jobCollection = dbClient.getCollection(collName)
    jobs = []
    for jobid in ids:
        # find() returns a cursor; take the first (only) match if present.
        result = list(jobCollection.find({'_id': jobid}))
        if len(result) > 0:
            job = result[0]
            print job["_id"], job["location"]
            jobs.append(job)
    return jobs
Example #21
def getJobList_sync(listCollectionName):

    print " --- get daily job by language and top cities---"

    lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    cities = jobaly.utils.loadArrayFromFile("loc_list.txt")

    #  lang_names = jobaly.utils.loadArrayFromFile("test_lang_list.txt")
    #  cities = jobaly.utils.loadArrayFromFile("test_loc_list.txt")

    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)
    crawlIndeed(collection, lang_names, cities)
Example #22
def testGetJobInfo():
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    today = datetime.date.today()
    listCollectionName = "daily_dice_list_" + str(today)
    infoCollectionName = "daily_dice_info_" + str(today)

    listCollectionName = "daily_dice_list_2014-07-11"
    infoCollectionName = "daily_dice_info_2014-07-11"

    print listCollectionName
    print infoCollectionName
    listCollection = dbClient.getCollection(listCollectionName)
    infoCollection = dbClient.getCollection(infoCollectionName)
    getJobInfo(dbClient, listCollection, infoCollection)
Example #23
def testParseParagraph():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    jid = "9e216b2d65bd864b"
    jid = "matrixga/78237-51"
    jid = "cybercod/CN-.NETwebDev-CA3"
    jid = "f3c336fa35c28771"
    jid = "10116717/638726"
    jid = "ocs/54391"
    jid = "0e230c368a34322b"
    jid = "6718adb8b28b9b39"
    job = DbClient.findById(newCol, jid)
    jobDesc = JobDescParser.parseJobDesc(job)
    jobDesc.printParagraphs()
Example #24
def main():
    # srcJobInfoCollName: jobinfo_lang_top_corps
    # webJobInfoCollName: test_jobinfo
    # webResumeColName: test_resume
    # JobIdfCollName: job_idf
    # print gConfig
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("test_jobinfo")
    # jobIdfCollection = dbClient.getCollection(gConfig["JobIdfCollName"])

    tfIdfGetter = TfIdfGetter()
    # tfIdfGetter.saveJobTfIdf(jobCollection, jobIdfCollection)
    idf, jobs = tfIdfGetter.getJobTfIdf(jobCollection)
    print idf
Example #25
def getJobList(listCollectionName):

    print " --- get daily job by language and top cities---"

    # lang_names = jobaly.utils.loadArrayFromFile("lang_list.txt")
    states = jobaly.utils.loadArrayFromFile("state_list.txt")

    diceClient = DiceApiClient({"age": "1"})
    dbClient = DbClient('localhost', 27017, "jobaly_daily")
    collection = dbClient.getCollection(listCollectionName)

    for state in states:
        diceClient.setState(state)
        print "-----prcoss location %s  -------" % (state)
        diceClient.processQuery(collection)
Example #26
def processResumes():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    resumeCollName = "web_resumes"

    resumemodelCollName = resumeCollName + "_model"
    resumeColl = srcBbClient.getCollection(resumeCollName)
    modelColl = srcBbClient.getCollection(resumemodelCollName)
    #  newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")

    #  for resume in collection.find():
    # Parse a single resume first as a sanity check (the save is disabled).
    resume = resumeColl.find_one()
    resumeModel = parseResume(resume)
    #   modelColl.save(resumeModel.serialize())

    saveResumeModels(resumeColl, modelColl)
Example #27
def main():
    #webJobInfoCollName: test_jobinfo
    resume = loadResume(
        "..\\..\\..\\data\\test_resumes\\Darin-Densley_web.txt")
    #  resume =  loadResume("..\\..\\..\\data\\test_resumes\\Java-Developer.txt")
    #  resume =  loadResume("..\\..\\..\\data\\test_resumes\\Fong-Kuo_data.txt")

    #  print resume
    #  resume = "I a am good java programmer, PHP, XML, hope juse c++, skill"
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("job100")
    kl = KL(jobCollection)
    jobs = kl.matchResume(resume)

    for job in jobs:
        print job["_id"], job["score"]
Example #28
def testTermsMatching():
    srcBbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    newCol = srcBbClient.getCollection("daily_job_webdev")
    newCol = srcBbClient.getCollection("daily_job_info_2014-06-16")
    newCol = srcBbClient.getCollection("daily_job_info_2014-06-10")

    print "newCol=", newCol
    allSents = getAllSentsInColl(newCol)

    terms = [
        "degree", "B.S.", "M.S.", "BS", "MS", "bachelor", "master", "phd",
        "master's"
    ]
    matchingSents = termsMatching(allSents, terms)
    #  dumpTwo(matchingSents, "sents\\degree_raw" , ( lambda x: x[0] + ":" + x[1] ) )
    dumpTwo(matchingSents, "sents\\degree_0610", (lambda x: x[0] + ":" + x[1]))
Example #29
def main():
    #webJobInfoCollName: test_jobinfo
    resumepath = ""
    resume = loadResume(
        "..\\..\\..\\data\\test_resumes\\Darin-Densley_web.txt")
    resume = loadResume("..\\..\\..\\data\\test_resumes\\Java-Developer.txt")
    resume = loadResume("..\\..\\..\\data\\test_resumes\\Fong-Kuo_data.txt")

    # print resume
    dbClient = DbClient('localhost', 27017, "jobaly")
    jobCollection = dbClient.getCollection("job100")
    tfIdfMatch = TfIdfMatch(jobCollection)

    jobs = tfIdfMatch.matchResume(resume)

    for job in jobs:
        print job["_id"], job["score"]
Example #30
def getOntology(resumefile, dbname, modelCollName):
    dbClient = DbClient('localhost', 27017, dbname)
    modelColl = dbClient.getCollection(modelCollName)

    with open(resumefile, 'r') as content_file:
        content = content_file.read()
        content = remove_non_ascii_2(content)
    resumeModel = resumeparser.parseResumeText(content)
    # print     resumeModel
    similarity = ModelSimilarity()
    result = similarity.match_jobColl(resumeModel, modelColl)
    # Print the top 20 matches with scores, then the matching keys alone.
    n = 1
    for key, value in result[:20]:
        print n, key, value
        n += 1
    print "- - - - - - -"
    for key, value in result[:20]:
        print key