# Build a TF-IDF model over company job descriptions (fragment: the
# `except` matching this `try:` lies outside the visible chunk).
start = time.time()
try:
    conn = utils.persist.connection()
    cur = conn.cursor()
    # `type` is compared to the literal string "None", not SQL NULL.
    sql = 'select description, position_name from company where type != "None"'
    cur.execute(sql)
    rst = cur.fetchall()
    transformer = TfidfTransformer()
    vectorizer = CountVectorizer()
    # Stopwords were pickled as a dict keyed by lowercase word (see the
    # has_key() membership tests below).  NOTE(review): file never closed.
    stopf = open('stopword', 'r')
    stopwords = pickle.load(stopf)
    train_data = []
    for rs in rst:
        # rs[0] = description, rs[1] = position_name.
        desc = utils.discrement_unicode(rs[0])
        segs = jieba.cut(desc, cut_all=False)
        wordss = []
        for seg in segs:
            if not stopwords.has_key(seg.lower()):
                wordss.append(seg.lower())
        # Position-name tokens are appended 5x to up-weight the job title
        # relative to the description in the bag of words.
        segs = jieba.cut(utils.discrement_unicode(rs[1]), cut_all=False)
        for seg in segs:
            if not stopwords.has_key(seg.lower()):
                for i in range(5):
                    wordss.append(seg.lower())
        train_data.append(' '.join(wordss))
    tfidf = transformer.fit_transform(vectorizer.fit_transform(train_data))
try: conn = utils.persist.connection() cur = conn.cursor() resume_id = '12227125-2' sql = 'select resumekeywords, dessalary, latestmajor, latestcollege,\ latestdegree, workyear, latestcompany, latesttitle from profile where \ resume_id = "%s"' % resume_id cur.execute(sql) rst = cur.fetchall() for rs in rst: for tem in rs: print utils.discrement_unicode(tem) sql = 'select unit_name, start_time, end_time, position_name, description from work where\ resume_id = "%s"' % resume_id cur.execute(sql) rst = cur.fetchall() for rs in rst: for tem in rs: print utils.discrement_unicode(tem) conn.close() except: traceback.print_exc() conn.close()
# Build one fixed-length count vector per row from the pickled topic-word
# template (fragment: `rst` and the open `topic_word` file come from
# earlier code; `x` is presumably consumed below -- not visible here).
topic_words = pickle.load(topic_word)
topicsx = []      # feature rows, one list of counts per record
topicsy = []      # numeric class label per record
type_dct = {}     # type name (lowercased) -> integer id
type_dct_r = {}   # integer id -> type name (reverse map)
num = 0
# First pass: assign a stable integer id to each distinct type (rs[1]).
for rs in rst:
    if not type_dct.has_key(rs[1].lower()):
        type_dct[rs[1].lower()] = num
        type_dct_r[num] = rs[1].lower()
        num += 1
# Second pass: count topic-word occurrences in each description (rs[0]).
for rs in rst:
    desc = rs[0]
    segs = jieba.cut(utils.discrement_unicode(desc), cut_all=False)
    # Fresh zeroed copy of the count template for every row.
    tpwords = copy.deepcopy(topic_words)
    for seg in segs:
        if tpwords.has_key(seg.lower()):
            tpwords[seg.lower()] += 1
    tpwordl = []
    topicsy.append(type_dct[rs[1].lower()])
    # Sorted key order keeps feature positions consistent across rows.
    for key in sorted(tpwords.keys()):
        tpwordl.append(tpwords[key])
    topicsx.append(tpwordl)
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
x = np.array(topicsx)
# Collect the vocabulary of non-stopword description tokens and pickle it
# to 'topic_word' (fragment: the `try:` matching the trailing `except`
# and the `sql` text lie outside this chunk).
    cur.execute(sql)
    stopf = open('stopword', 'rb')
    stopword = pickle.load(stopf)
    topics = []
    topic_words = {}   # token (lowercased) -> 0; used as a vocabulary set
    rst = cur.fetchall()
    for rs in rst:
        # Presumably rs = (description, position_name, type) -- TODO confirm.
        desc = rs[0]
        pname = rs[1]
        ptype = rs[2]
        topics.append(ptype)
        topic_word = []
        segs = jieba.cut(utils.discrement_unicode(desc))
        for seg in segs:
            # NOTE(review): stopword lookup here is case-sensitive, while
            # sibling code lowercases before the test -- possibly unintended.
            if stopword.has_key(seg):
                continue
            if not topic_words.has_key(seg.lower()):
                topic_words[seg.lower()] = 0
    pdb.set_trace()  # NOTE(review): debugger breakpoint left in
    # print topic_words
    for word in topic_words:
        print word
    # `topic_word` is rebound from list to file handle here; never closed.
    topic_word = open('topic_word', 'wb')
    pickle.dump(topic_words, topic_word)
except:
    traceback.print_exc()
# Parse the serialized project-list blob in rs[1] and insert one row per
# project into `projects` (fragment: `sql` and `cur` come from earlier code).
cur.execute(sql)
rst = cur.fetchall()
# pdb.set_trace()
for rs in rst:
    print rs[2]
    # Skip rows whose project blob is missing or the literal '[]'.
    if rs[1] == '[]' or not rs[1]:
        continue
    # rsff3 = rs[1].replace(u'\u201c', '"')
    # rsff2 = rsff3.replace('\n', '')
    # rsff3 = utils.discrement_unicode(rs[1])
    # pdb.set_trace()
    # Strip newlines and curly quotes that would break the eval() below.
    rsff3 = rs[1].replace('\n', '')
    rsff3 = rsff3.replace(u'\u201c', '"')
    rsff3 = rsff3.replace(u'\u2018', '')
    rsff3 = utils.discrement_unicode(rsff3)
    rsff2 = utils.convert_code(rsff3)
    # SECURITY: eval() executes arbitrary code from DB-stored text;
    # ast.literal_eval would be the safe replacement -- TODO confirm the
    # blob is always a plain Python literal.
    rsff = eval(rsff2)
    # pdb.set_trace()
    # Add unicode copies of each key/value (Python 2 str -> unicode);
    # the original byte-string keys are kept (pop is commented out).
    for rsf in rsff:
        for key in rsf.keys():
            rsf[key.decode('utf8')] = rsf[key].decode('utf8')
            # rsf.pop(key)
    for rsf in rsff:
        # NOTE(review): string-formatted INSERT breaks on embedded quotes
        # and is injectable; should be parameterized.
        sql = 'insert into projects(name, start, end, description, resume_id, software,\
 hardware, developtool, dudescription) values("%s", "%s", "%s", "%s", "%s", "%s",\
 "%s", "%s", "%s")' % (rsf.get('name', ''), rsf.get('start_time', ''), rsf.get('end_time', ''),
                       rsf.get(u'项目描述', ''), rs[0], rsf.get(u'软件环境', ''), rsf.get(u'硬件环境', ''),
                       rsf.get(u'开发工具', ''), rsf.get(u'责任描述', ''))
        cur.execute(sql)
rst = cur.fetchall() # topic_word = open('topic_word', 'r') topic_word = open('tfidfwords', 'r') topic_words = pickle.load(topic_word) topicsx = [] topicsy = [] type_dct = {} type_dct_r = {} train_data = [] train_tags = [] for rs in rst: desc = rs[0] segs = jieba.cut(utils.discrement_unicode(desc).lower(), cut_all=False) tpword = copy.deepcopy(topic_words) train_words = [] for seg in segs: if tpword.has_key(seg.lower()): tpword[seg.lower()] += 1 segs = jieba.cut(utils.discrement_unicode(rs[2]).lower(), cut_all=False) for seg in segs: if tpword.has_key(seg.lower()): tpword[seg.lower()] += 1 # pdb.set_trace() # for key in sorted(tpword.keys()): for key in tpword.keys(): train_words.append(tpword[key])
def generate_test_feature(cur, pos_id, resume_id):
    """Build one feature vector comparing position `pos_id` (companytest)
    against resume `resume_id` (profiletest/worktest).

    Returns the feature list for the FIRST profile row of the FIRST
    position row -- the `return` sits inside both loops.  NOTE(review):
    that early return looks suspicious but is documented as-is here.
    Raises NameError via the except path if the profile query fails
    (`profile` is then unbound) -- the handler only drops into pdb.
    """
    # NOTE(review): string-formatted SQL throughout this function breaks
    # on embedded quotes and is injectable; should be parameterized.
    sql = 'select position_id, low_income, description, low_workage, position_name,\
    workage, degree from companytest where position_id = %d' % pos_id
    cur.execute(sql)
    rst = cur.fetchall()
    for term in rst:
        # term indices: 0 position_id, 1 low_income, 2 description,
        # 3 low_workage, 4 position_name, 5 workage, 6 degree.
        com_low_income = term[1]
        pro_low_income = 0
        com_position = utils.discrement_unicode(term[4])
        com_workage = term[5]
        com_degree = term[6]
        pro_position = ''
        com_description = utils.discrement_unicode(term[2])
        pro_decription = ''
        pro_hisprojects = ''
        pro_otherinfo = ''
        com_lst = []
        # Values below 500 are treated as a yearly figure in other units
        # and converted to monthly -- TODO confirm the 8000/12 scaling.
        # Python 2 `/` on ints floors, so this buckets into 5000-RMB bands.
        if com_low_income < 500 and com_low_income > 0:
            com_low_income = (com_low_income * 8000) / 12
        com_low_income = com_low_income / 5000
        # NOTE(review): term[5] is `workage`; `low_workage` is term[3] in
        # the SELECT -- this may read the wrong column.
        low_workage = term[5]
        if not low_workage:
            low_workage = 0
        com_lst.append(low_workage)
        com_workage = term[5]
        com_degree = term[6]
        keywords = get_keywords(utils.discrement_unicode(term[2]))  # unused below
        com_description = utils.discrement_unicode(term[2])
        try:
            sqlp = 'select dessalary, skills, destitle, hisprojects, otherinfo, resume_id, workyear, latestdegree \
            from profiletest where resume_id = "%s" limit 5' % (resume_id)
            cur.execute(sqlp)
            profile = cur.fetchall()
        except:
            pdb.set_trace()  # NOTE(review): debugger breakpoint left in
        for pro in profile:
            # pro indices: 0 dessalary, 1 skills, 2 destitle,
            # 3 hisprojects, 4 otherinfo, 5 resume_id, 6 workyear,
            # 7 latestdegree.
            # Empty salary text falls back to '0' so the regex matches.
            if not pro[0]:
                incomes = salaryp.search('0')
            else:
                incomes = salaryp.search(pro[0])
            try:
                if not resume_id:
                    continue
                # Prefer the most recent job title from the work table.
                pos_sql = 'select position_name from worktest where resume_id = "%s" order by end_time desc' % resume_id
                cur.execute(pos_sql)
                pos_rst = cur.fetchall()
                pro_position = pos_rst[0][0]
            except:
                # Fall back to the profile's declared title.
                pro_position = pro[2]
            if incomes:
                low_income = incomes.group(0)
            else:
                low_income = 0
            pro_low_income = int(low_income) / 5000
            pro_hisprojects = utils.discrement_unicode(pro[3])
            pro_otherinfo = utils.discrement_unicode(pro[4])
            pro_skills = utils.discrement_unicode(pro[1])
            try:
                # try/except around a plain string format can never fire.
                sql_work = 'select description from worktest where resume_id = "%s"' % resume_id
            except:
                traceback.print_exc()
                pdb.set_trace()
            cur.execute(sql_work)
            pro_decription = cur.fetchall()
            position_feature = get_position_feature(com_position, pro_position)
            com_feature = []
            descrip_feature = get_description_feature(com_description, pro_hisprojects, \
            pro_decription, pro_otherinfo, pro_skills)
            com_feature.append(com_low_income)
            com_feature.append(pro_low_income)
            pro_workage = pro[6]
            workage = salaryp.search(pro_workage)
            if workage:
                pro_workage = int(workage.group(0))
            else:
                pro_workage = 0
            # Map degree text to an ordinal: bachelor-ish 1, master-ish 2,
            # anything else 0 (matched by CJK characters in the text).
            pro_degree = pro[7]
            if '科' in pro_degree or '学' in pro_degree:
                pro_degree = 1
            elif '硕' in pro_degree or '士' in pro_degree:
                pro_degree = 2
            else:
                pro_degree = 0
            com_feature.append(com_workage)
            com_feature.append(pro_workage - com_workage)
            com_feature.append(com_degree)
            com_feature.append(pro_degree - com_degree)
            com_feature.append(round(position_feature, 3))
            com_feature += descrip_feature
            return com_feature
def generate_test_feature(cur, pos_id, resume_id):
    """Duplicate of the sibling generate_test_feature definition; if both
    live in one module, this later definition shadows the earlier one.

    Builds one feature vector comparing position `pos_id` (companytest)
    against resume `resume_id` (profiletest/worktest) and returns it for
    the FIRST profile row of the FIRST position row (the `return` sits
    inside both loops -- NOTE(review): looks suspicious, documented as-is).
    """
    # NOTE(review): string-formatted SQL throughout breaks on embedded
    # quotes and is injectable; should be parameterized.
    sql = 'select position_id, low_income, description, low_workage, position_name,\
    workage, degree from companytest where position_id = %d' % pos_id
    cur.execute(sql)
    rst = cur.fetchall()
    for term in rst:
        # term indices: 0 position_id, 1 low_income, 2 description,
        # 3 low_workage, 4 position_name, 5 workage, 6 degree.
        com_low_income = term[1]
        pro_low_income = 0
        com_position = utils.discrement_unicode(term[4])
        com_workage = term[5]
        com_degree = term[6]
        pro_position = ''
        com_description = utils.discrement_unicode(term[2])
        pro_decription = ''
        pro_hisprojects = ''
        pro_otherinfo = ''
        com_lst = []
        # Sub-500 values are rescaled (TODO confirm 8000/12 semantics),
        # then floored into 5000-RMB bands (Python 2 integer division).
        if com_low_income < 500 and com_low_income > 0:
            com_low_income = (com_low_income * 8000) / 12
        com_low_income = com_low_income / 5000
        # NOTE(review): term[5] is `workage`; `low_workage` is term[3] --
        # possibly the wrong column.
        low_workage = term[5]
        if not low_workage:
            low_workage = 0
        com_lst.append(low_workage)
        com_workage = term[5]
        com_degree = term[6]
        keywords = get_keywords(
            utils.discrement_unicode(term[2]))  # unused below
        com_description = utils.discrement_unicode(term[2])
        try:
            sqlp = 'select dessalary, skills, destitle, hisprojects, otherinfo, resume_id, workyear, latestdegree \
            from profiletest where resume_id = "%s" limit 5' % (resume_id)
            cur.execute(sqlp)
            profile = cur.fetchall()
        except:
            pdb.set_trace()  # NOTE(review): debugger breakpoint left in
        for pro in profile:
            # pro indices: 0 dessalary, 1 skills, 2 destitle,
            # 3 hisprojects, 4 otherinfo, 5 resume_id, 6 workyear,
            # 7 latestdegree.
            if not pro[0]:
                incomes = salaryp.search('0')
            else:
                incomes = salaryp.search(pro[0])
            try:
                if not resume_id:
                    continue
                # Most recent job title from the work-history table.
                pos_sql = 'select position_name from worktest where resume_id = "%s" order by end_time desc' % resume_id
                cur.execute(pos_sql)
                pos_rst = cur.fetchall()
                pro_position = pos_rst[0][0]
            except:
                pro_position = pro[2]
            if incomes:
                low_income = incomes.group(0)
            else:
                low_income = 0
            pro_low_income = int(low_income) / 5000
            pro_hisprojects = utils.discrement_unicode(pro[3])
            pro_otherinfo = utils.discrement_unicode(pro[4])
            pro_skills = utils.discrement_unicode(pro[1])
            try:
                # try/except around a plain string format can never fire.
                sql_work = 'select description from worktest where resume_id = "%s"' % resume_id
            except:
                traceback.print_exc()
                pdb.set_trace()
            cur.execute(sql_work)
            pro_decription = cur.fetchall()
            position_feature = get_position_feature(com_position, pro_position)
            com_feature = []
            descrip_feature = get_description_feature(com_description, pro_hisprojects, \
            pro_decription, pro_otherinfo, pro_skills)
            com_feature.append(com_low_income)
            com_feature.append(pro_low_income)
            pro_workage = pro[6]
            workage = salaryp.search(pro_workage)
            if workage:
                pro_workage = int(workage.group(0))
            else:
                pro_workage = 0
            # Degree text -> ordinal: bachelor-ish 1, master-ish 2, else 0.
            pro_degree = pro[7]
            if '科' in pro_degree or '学' in pro_degree:
                pro_degree = 1
            elif '硕' in pro_degree or '士' in pro_degree:
                pro_degree = 2
            else:
                pro_degree = 0
            com_feature.append(com_workage)
            com_feature.append(pro_workage - com_workage)
            com_feature.append(com_degree)
            com_feature.append(pro_degree - com_degree)
            com_feature.append(round(position_feature, 3))
            com_feature += descrip_feature
            return com_feature
try: conn = utils.persist.connection() cur = conn.cursor() # sql = 'select low_income, high_income, low_workage, high_workage, description, \ # position_name, naren_created from company where id > %d' % 459 sql = "select id, description from company where id > %d " % 624 cur.execute(sql) rst = cur.fetchall() pdb.set_trace() for rs in rst: # print utils.discrement_unicode(rs[1]) print rs[0] try: usql = """update company set description = "%s" where id = %d""" % (utils.discrement_unicode(rs[1]), rs[0]) cur.execute(usql) except: pdb.set_trace() traceback.print_exc() conn.commit() conn.close() except: traceback.print_exc() pdb.set_trace() conn.close() lu = time.time() print lu - start
# Duplicate of the sibling topic-vector chunk: build one fixed-length
# count vector per row from the pickled topic-word template (fragment:
# `rst` and the open `topic_word` file come from earlier code).
topic_words = pickle.load(topic_word)
topicsx = []      # feature rows, one list of counts per record
topicsy = []      # numeric class label per record
type_dct = {}     # type name (lowercased) -> integer id
type_dct_r = {}   # integer id -> type name (reverse map)
num = 0
# First pass: assign a stable integer id to each distinct type (rs[1]).
for rs in rst:
    if not type_dct.has_key(rs[1].lower()):
        type_dct[rs[1].lower()] = num
        type_dct_r[num] = rs[1].lower()
        num += 1
# Second pass: count topic-word occurrences in each description (rs[0]).
for rs in rst:
    desc = rs[0]
    segs = jieba.cut(utils.discrement_unicode(desc), cut_all=False)
    # Fresh zeroed copy of the count template for every row.
    tpwords = copy.deepcopy(topic_words)
    for seg in segs:
        if tpwords.has_key(seg.lower()):
            tpwords[seg.lower()] += 1
    tpwordl = []
    topicsy.append(type_dct[rs[1].lower()])
    # Sorted key order keeps feature positions consistent across rows.
    for key in sorted(tpwords.keys()):
        tpwordl.append(tpwords[key])
    topicsx.append(tpwordl)
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
x = np.array(topicsx)
def get_feature(cur, feature_lines, flag, pos_id):
    """Append one CSV feature line per (position, candidate-profile) pair.

    cur           -- open DB cursor
    feature_lines -- output list; each element is a comma-joined row of
                     numeric features, ending with `flag` as the label
    flag          -- hr_confirm value; used both as a query filter and as
                     the trailing label column
    pos_id        -- restrict to one company position id; falsy means all

    NOTE(review): string-formatted SQL throughout breaks on embedded
    quotes and is injectable; should be parameterized.  If the profile
    query fails, `profile` is unbound and the loop below raises NameError.
    """
    if not pos_id:
        sql = 'select position_id, low_income, description, low_workage, position_name,\
        workage, degree from company'
    else:
        sql = 'select position_id, low_income, description, low_workage, position_name,\
        workage, degree from company where position_id = %d' % pos_id
    cur.execute(sql)
    rst = cur.fetchall()
    nummn = 0  # running count of profiles processed (progress printout)
    for term in rst:
        # term indices: 0 position_id, 1 low_income, 2 description,
        # 3 low_workage, 4 position_name, 5 workage, 6 degree.
        com_low_income = term[1]
        pro_low_income = 0
        com_position = utils.discrement_unicode(term[4])
        pro_position = ''
        com_description = utils.discrement_unicode(term[2])
        pro_decription = ''
        pro_hisprojects = ''
        pro_otherinfo = ''
        com_lst = []
        # Sub-500 values are rescaled (TODO confirm the 8000/12 semantics),
        # then floored into 5000-RMB bands (Python 2 integer division).
        if com_low_income < 500 and com_low_income > 0:
            com_low_income = (com_low_income * 8000) / 12
        com_low_income = com_low_income / 5000
        # NOTE(review): term[5] is `workage`; `low_workage` is term[3] in
        # the SELECT -- this may read the wrong column.
        low_workage = term[5]
        if not low_workage:
            low_workage = 0
        com_lst.append(low_workage)
        com_workage = term[5]
        com_degree = term[6]
        keywords = get_keywords(utils.discrement_unicode(term[2]))  # unused below
        com_description = utils.discrement_unicode(term[2])
        try:
            # Profiles paired with this position that are not yet used for
            # training (train_flag = 0) and carry the requested hr_confirm.
            sqlp = 'select dessalary, skills, latesttitle, hisprojects, otherinfo, pf.resume_id, workyear, latestdegree, \
            pr.pos_id, pr.resume_id from pos_resume as pr left join profile as pf on pr.resume_id = pf.resume_id \
            where pr.train_flag = 0 and pr.pos_id = %d and pr.hr_confirm = %d' % (term[0], flag)
            cur.execute(sqlp)
            profile = cur.fetchall()
        except:
            pdb.set_trace()  # NOTE(review): debugger breakpoint left in
        for pro in profile:
            # pro indices: 0 dessalary, 1 skills, 2 latesttitle,
            # 3 hisprojects, 4 otherinfo, 5 resume_id, 6 workyear,
            # 7 latestdegree, 8 pos_id, 9 resume_id (join side).
            nummn += 1
            print nummn
            # Empty salary text falls back to '0' so the regex matches.
            if not pro[0]:
                incomes = salaryp.search('0')
            else:
                incomes = salaryp.search(pro[0])
            try:
                resume_id = pro[5]
                if not resume_id:
                    continue
                # Prefer the most recent job title from the work table.
                pos_sql = 'select position_name from work where resume_id = %d order by end_time desc' % resume_id
                cur.execute(pos_sql)
                pos_rst = cur.fetchall()
                pro_position = utils.discrement_unicode(pos_rst[0][0])
            except:
                # Fall back to the profile's latest title.
                pro_position = utils.discrement_unicode(pro[2])
            if incomes:
                low_income = incomes.group(0)
            else:
                low_income = 0
            pro_low_income = int(low_income) / 5000
            pro_hisprojects = utils.discrement_unicode(pro[3])
            pro_otherinfo = utils.discrement_unicode(pro[4])
            pro_skills = utils.discrement_unicode(pro[1])
            try:
                # try/except around a plain string format can never fire.
                sql_work = 'select description from work where resume_id = "%s"' % resume_id
            except:
                traceback.print_exc()
                pdb.set_trace()
            cur.execute(sql_work)
            pro_decription = cur.fetchall()
            position_feature = get_position_feature(com_position, pro_position)
            com_feature = []
            descrip_feature = get_description_feature(com_description, pro_hisprojects, \
            pro_decription, pro_otherinfo, pro_skills)
            com_feature.append(com_low_income)
            com_feature.append(pro_low_income)
            pro_workage = pro[6]
            workage = salaryp.search(pro_workage)
            if workage:
                pro_workage = int(workage.group(0))
            else:
                pro_workage = 0
            # Degree text -> ordinal: bachelor-ish 1, master-ish 2, else 0
            # (matched by CJK characters in the degree string).
            pro_degree = utils.discrement_unicode(pro[7])
            if '科' in pro_degree or '学' in pro_degree:
                pro_degree = 1
            elif '硕' in pro_degree or '士' in pro_degree:
                pro_degree = 2
            else:
                pro_degree = 0
            com_feature.append(com_workage)
            com_feature.append(pro_workage - com_workage)
            com_feature.append(com_degree)
            com_feature.append(pro_degree - com_degree)
            com_feature.append(round(position_feature, 3))
            com_feature += descrip_feature
            # pdb.set_trace()
            # Extra work-history features appended, then `flag` as label.
            feature = utils.get_work_feature(cur, resume_id)
            com_feature = com_feature + feature
            com_feature.append(flag)
            feature_lines.append(','.join(map(lambda x: str(x), com_feature)))
rst = cur.fetchall() # topic_word = open('topic_word', 'r') topic_word = open('tfidfwords', 'r') topic_words = pickle.load(topic_word) topicsx = [] topicsy = [] type_dct = {} type_dct_r = {} train_data = [] train_tags = [] for rs in rst: desc = rs[0] segs = jieba.cut(utils.discrement_unicode(desc).lower(), cut_all=False) tpword = copy.deepcopy(topic_words) train_words = [] for seg in segs: if tpword.has_key(seg.lower()): tpword[seg.lower()] += 1 segs = jieba.cut(utils.discrement_unicode(rs[2]).lower(), cut_all=False) for seg in segs: if tpword.has_key(seg.lower()): tpword[seg.lower()] += 1 # pdb.set_trace() # for key in sorted(tpword.keys()): for key in tpword.keys(): train_words.append(tpword[key])
# Duplicate of the sibling TF-IDF training chunk (fragment: the `except`
# matching this `try:` lies outside the visible chunk).
start = time.time()
try:
    conn = utils.persist.connection()
    cur = conn.cursor()
    # `type` is compared to the literal string "None", not SQL NULL.
    sql = 'select description, position_name from company where type != "None"'
    cur.execute(sql)
    rst = cur.fetchall()
    transformer = TfidfTransformer()
    vectorizer = CountVectorizer()
    # Stopwords were pickled as a dict keyed by lowercase word (see the
    # has_key() membership tests below).  NOTE(review): file never closed.
    stopf = open('stopword', 'r')
    stopwords = pickle.load(stopf)
    train_data = []
    for rs in rst:
        # rs[0] = description, rs[1] = position_name.
        desc = utils.discrement_unicode(rs[0])
        segs = jieba.cut(desc, cut_all=False)
        wordss = []
        for seg in segs:
            if not stopwords.has_key(seg.lower()):
                wordss.append(seg.lower())
        # Position-name tokens are appended 5x to up-weight the job title
        # relative to the description in the bag of words.
        segs = jieba.cut(utils.discrement_unicode(rs[1]), cut_all=False)
        for seg in segs:
            if not stopwords.has_key(seg.lower()):
                for i in range(5):
                    wordss.append(seg.lower())
        train_data.append(' '.join(wordss))
    tfidf = transformer.fit_transform(vectorizer.fit_transform(train_data))
try: conn = utils.persist.connection() cur = conn.cursor() # sql = 'select low_income, high_income, low_workage, high_workage, description, \ # position_name, naren_created from company where id > %d' % 459 sql = 'select id, description from company where id > %d ' % 624 cur.execute(sql) rst = cur.fetchall() pdb.set_trace() for rs in rst: # print utils.discrement_unicode(rs[1]) print rs[0] try: usql = '''update company set description = "%s" where id = %d''' % ( utils.discrement_unicode(rs[1]), rs[0]) cur.execute(usql) except: pdb.set_trace() traceback.print_exc() conn.commit() conn.close() except: traceback.print_exc() pdb.set_trace() conn.close() lu = time.time() print lu - start
# Train a Multinomial Naive Bayes classifier on rows with a known type,
# then fetch the untyped rows for prediction (fragment: `cur` comes from
# earlier code; the final `for` loop continues past this chunk).
sql = 'select description, type from company where type != "None"'
cur.execute(sql)
rst = cur.fetchall()
topic_word = open('topic_word', 'r')  # NOTE(review): never closed here
topic_words = pickle.load(topic_word)
topicsx = []
topicsy = []
type_dct = {}
type_dct_r = {}
train_words = []   # raw description strings (vectorized below)
train_tags = []    # the `type` column, used directly as class labels
for rs in rst:
    desc = rs[0]
    train_words.append(utils.discrement_unicode(desc))
    train_tags.append(utils.discrement_unicode(rs[1]))
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
train_data = vectorize(train_words)
clf = MultinomialNB(alpha=0.01)
clf.fit(train_data, train_tags)
# Rows still typed as the literal string "None" become the test set.
sqll = 'select description, position_name, id from company where type = "None"'
cur.execute(sqll)
rst = cur.fetchall()
test_words = []
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
for rs in rst:
def get_feature(cur, feature_lines, flag):
    """Append one CSV feature line per (position, candidate-profile) pair
    and mark each consumed pair as used for training.

    Variant of the sibling 4-argument get_feature: no pos_id filter, the
    profile query is capped at `limit 5`, no utils.get_work_feature
    extras, and set_train_flg() is called after each emitted row.

    cur           -- open DB cursor
    feature_lines -- output list of comma-joined feature rows; each ends
                     with `flag` as the label column
    flag          -- hr_confirm value (query filter and label)

    NOTE(review): string-formatted SQL throughout breaks on embedded
    quotes and is injectable; should be parameterized.  If the profile
    query fails, `profile` is unbound and the loop below raises NameError.
    """
    sql = 'select position_id, low_income, description, low_workage, position_name,\
    workage, degree from company'
    cur.execute(sql)
    rst = cur.fetchall()
    nummn = 0  # running count of profiles processed (progress printout)
    for term in rst:
        # term indices: 0 position_id, 1 low_income, 2 description,
        # 3 low_workage, 4 position_name, 5 workage, 6 degree.
        com_low_income = term[1]
        pro_low_income = 0
        com_position = utils.discrement_unicode(term[4])
        pro_position = ''
        com_description = utils.discrement_unicode(term[2])
        pro_decription = ''
        pro_hisprojects = ''
        pro_otherinfo = ''
        com_lst = []
        # Sub-500 values are rescaled (TODO confirm the 8000/12 semantics),
        # then floored into 5000-RMB bands (Python 2 integer division).
        if com_low_income < 500 and com_low_income > 0:
            com_low_income = (com_low_income * 8000) / 12
        com_low_income = com_low_income / 5000
        # NOTE(review): term[5] is `workage`; `low_workage` is term[3] in
        # the SELECT -- this may read the wrong column.
        low_workage = term[5]
        if not low_workage:
            low_workage = 0
        com_lst.append(low_workage)
        com_workage = term[5]
        com_degree = term[6]
        keywords = get_keywords(utils.discrement_unicode(term[2]))  # unused below
        com_description = utils.discrement_unicode(term[2])
        try:
            # At most 5 not-yet-trained profiles paired with this position
            # carrying the requested hr_confirm value.
            sqlp = 'select dessalary, skills, latesttitle, hisprojects, otherinfo, pf.resume_id, workyear, latestdegree, \
            pr.pos_id, pr.resume_id from pos_resume as pr left join profile as pf on pr.resume_id = pf.resume_id \
            where pr.train_flag = 0 and pr.pos_id = %d and pr.hr_confirm = %d limit 5' % (
                term[0], flag)
            cur.execute(sqlp)
            profile = cur.fetchall()
        except:
            pdb.set_trace()  # NOTE(review): debugger breakpoint left in
        for pro in profile:
            # pro indices: 0 dessalary, 1 skills, 2 latesttitle,
            # 3 hisprojects, 4 otherinfo, 5 resume_id, 6 workyear,
            # 7 latestdegree, 8 pos_id, 9 resume_id (join side).
            nummn += 1
            print nummn
            # Empty salary text falls back to '0' so the regex matches.
            if not pro[0]:
                incomes = salaryp.search('0')
            else:
                incomes = salaryp.search(pro[0])
            try:
                resume_id = pro[5]
                if not resume_id:
                    continue
                # Prefer the most recent job title from the work table.
                pos_sql = 'select position_name from work where resume_id = %d order by end_time desc' % resume_id
                cur.execute(pos_sql)
                pos_rst = cur.fetchall()
                pro_position = utils.discrement_unicode(pos_rst[0][0])
            except:
                # Fall back to the profile's latest title.
                pro_position = utils.discrement_unicode(pro[2])
            if incomes:
                low_income = incomes.group(0)
            else:
                low_income = 0
            pro_low_income = int(low_income) / 5000
            pro_hisprojects = utils.discrement_unicode(pro[3])
            pro_otherinfo = utils.discrement_unicode(pro[4])
            pro_skills = utils.discrement_unicode(pro[1])
            try:
                # try/except around a plain string format can never fire.
                sql_work = 'select description from work where resume_id = "%s"' % resume_id
            except:
                traceback.print_exc()
                pdb.set_trace()
            cur.execute(sql_work)
            pro_decription = cur.fetchall()
            position_feature = get_position_feature(com_position, pro_position)
            com_feature = []
            descrip_feature = get_description_feature(com_description, pro_hisprojects, \
            pro_decription, pro_otherinfo, pro_skills)
            com_feature.append(com_low_income)
            com_feature.append(pro_low_income)
            pro_workage = pro[6]
            workage = salaryp.search(pro_workage)
            if workage:
                pro_workage = int(workage.group(0))
            else:
                pro_workage = 0
            # Degree text -> ordinal: bachelor-ish 1, master-ish 2, else 0
            # (matched by CJK characters in the degree string).
            pro_degree = utils.discrement_unicode(pro[7])
            if '科' in pro_degree or '学' in pro_degree:
                pro_degree = 1
            elif '硕' in pro_degree or '士' in pro_degree:
                pro_degree = 2
            else:
                pro_degree = 0
            com_feature.append(com_workage)
            com_feature.append(pro_workage - com_workage)
            com_feature.append(com_degree)
            com_feature.append(pro_degree - com_degree)
            com_feature.append(round(position_feature, 3))
            com_feature += descrip_feature
            com_feature.append(flag)
            feature_lines.append(','.join(map(lambda x: str(x), com_feature)))
            # Mark this (position, resume) pair as consumed for training.
            set_train_flg(cur, term[0], resume_id)
# Duplicate of the sibling Naive-Bayes chunk: train MultinomialNB on rows
# with a known type, then fetch untyped rows for prediction (fragment:
# `cur` comes from earlier code; the final `for` continues past this chunk).
sql = 'select description, type from company where type != "None"'
cur.execute(sql)
rst = cur.fetchall()
topic_word = open('topic_word', 'r')  # NOTE(review): never closed here
topic_words = pickle.load(topic_word)
topicsx = []
topicsy = []
type_dct = {}
type_dct_r = {}
train_words = []   # raw description strings (vectorized below)
train_tags = []    # the `type` column, used directly as class labels
for rs in rst:
    desc = rs[0]
    train_words.append(utils.discrement_unicode(desc))
    train_tags.append(utils.discrement_unicode(rs[1]))
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
train_data = vectorize(train_words)
clf = MultinomialNB(alpha=0.01)
clf.fit(train_data, train_tags)
# Rows still typed as the literal string "None" become the test set.
sqll = 'select description, position_name, id from company where type = "None"'
cur.execute(sqll)
rst = cur.fetchall()
test_words = []
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
for rs in rst:
# Duplicate of the sibling project-insert chunk: parse the serialized
# project-list blob in rs[1] and insert one row per project into
# `projects` (fragment: `sql` and `cur` come from earlier code).
cur.execute(sql)
rst = cur.fetchall()
# pdb.set_trace()
for rs in rst:
    print rs[2]
    # Skip rows whose project blob is missing or the literal '[]'.
    if rs[1] == '[]' or not rs[1]:
        continue
    # rsff3 = rs[1].replace(u'\u201c', '"')
    # rsff2 = rsff3.replace('\n', '')
    # rsff3 = utils.discrement_unicode(rs[1])
    # pdb.set_trace()
    # Strip newlines and curly quotes that would break the eval() below.
    rsff3 = rs[1].replace('\n', '')
    rsff3 = rsff3.replace(u'\u201c', '"')
    rsff3 = rsff3.replace(u'\u2018', '')
    rsff3 = utils.discrement_unicode(rsff3)
    rsff2 = utils.convert_code(rsff3)
    # SECURITY: eval() executes arbitrary code from DB-stored text;
    # ast.literal_eval would be the safe replacement -- TODO confirm the
    # blob is always a plain Python literal.
    rsff = eval(rsff2)
    # pdb.set_trace()
    # Add unicode copies of each key/value (Python 2 str -> unicode);
    # the original byte-string keys are kept (pop is commented out).
    for rsf in rsff:
        for key in rsf.keys():
            rsf[key.decode('utf8')] = rsf[key].decode('utf8')
            # rsf.pop(key)
    for rsf in rsff:
        # NOTE(review): string-formatted INSERT breaks on embedded quotes
        # and is injectable; should be parameterized.
        sql = 'insert into projects(name, start, end, description, resume_id, software,\
 hardware, developtool, dudescription) values("%s", "%s", "%s", "%s", "%s", "%s",\
 "%s", "%s", "%s")' % (rsf.get('name', ''), rsf.get('start_time', ''), rsf.get('end_time', ''),
                       rsf.get(u'项目描述', ''), rs[0], rsf.get(u'软件环境', ''), rsf.get(u'硬件环境', ''),
                       rsf.get(u'开发工具', ''), rsf.get(u'责任描述', ''))
        cur.execute(sql)