import operator

from nltk.tokenize import word_tokenize

# Project-local helpers; the module paths below are assumptions based on
# how the names are used in this file.
import datautils
from datautils import dumpTwo
from dbclient import DbClient            # assumed location
from jobdescparser import JobDescParser  # assumed location
from ontologylib import OntologyLib      # assumed location


def getSentsByOntology():
    # Collect every job-posting sentence that contains at least three
    # terms from the web-dev ontology.
    owlfile = r"..\..\jobaly\ontology\web_dev.owl"
    ontology = OntologyLib(owlfile)
    # Pad each term with spaces so substring search only matches whole words.
    terms = [" " + x.lower() + " " for x in ontology.getLabelList()]
    terms.extend([" " + x.lower() + " " for x in ontology.getAllClassNames()])

    srcDbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcDbClient.getCollection("daily_job_webdev")

    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            count = 0
            sent = " " + sent.lower() + " "
            for term in terms:
                if sent.find(term) != -1:
                    count += 1
                    if count == 3:
                        print sent.encode("GBK", "ignore")
                        matchingSents.append((jid, sent))
                        break

    # Shortest sentences first, dumped as "id:sentence" lines.
    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, "term3", lambda x: x[0] + ":" + x[1])
def preProcess(data_set_name, target_set_name):
    # Drop over-long sentences, attach a pre-processed copy, and sort the
    # records by token count. preProcessFun and dumpLam2 are assumed to be
    # defined elsewhere in this module.
    max_length = 200
    data = datautils.loadJson(data_set_name)
    newdata = []
    for item in data:
        if len(item[1]) < max_length:
            item.append(preProcessFun(item[1]))  # item[2]: processed sentence
            item[1] = len(item[2].split())       # replace text with its token count
            newdata.append(item)
    newdata = sorted(newdata, key=operator.itemgetter(1))
    datautils.dumpTwo(newdata, target_set_name, dumpLam2)
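# Hypothetical usage, assuming the "term3" dump written by
# getSentsByOntology above is a JSON list of [job_id, sentence] records
# that datautils.loadJson can read back ("term3_processed" is just an
# illustrative output name, not one used elsewhere in the project):
#
#   preProcess("term3", "term3_processed")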
def getSentenceByTerm(collection, term, outputPath):
    # Collect every sentence whose tokens include the given term, then
    # dump the matches (shortest first) as "id:sentence" lines.
    matchingSents = []
    for job in collection.find():
        jobDesc = JobDescParser.parseJobDesc(job)
        sents = jobDesc.listAllSentences()
        jid = job["_id"]
        for sent in sents:
            tokens = [token.lower() for token in word_tokenize(sent)]
            if term in tokens:
                matchingSents.append((jid, sent))
                print sent.encode("GBK", "ignore")
    sortedsents = sorted(matchingSents, key=lambda x: len(x[1]))
    dumpTwo(sortedsents, outputPath, lambda x: x[0] + ":" + x[1])
def getJavaScript():
    # Convenience wrapper: pull all sentences mentioning "javascript"
    # from the web-dev job collection.
    srcDbClient = DbClient('localhost', 27017, "jobaly_daily_test")
    collection = srcDbClient.getCollection("daily_job_webdev")
    getSentenceByTerm(collection, "javascript", r"..\skill\output\javascript")
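# The functions above depend on datautils.dumpTwo(items, path, fmt), whose
# real implementation lives elsewhere in the project. The sketch below is
# only a guess inferred from the call sites (write one fmt(item) line per
# item to the file at the given path); dumpTwo_sketch is a hypothetical name.
def dumpTwo_sketch(items, path, fmt):
    import codecs
    with codecs.open(path, "w", encoding="utf-8") as f:
        for item in items:
            f.write(fmt(item) + "\n")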