# Build a TF-IDF model over company job descriptions (fragment: the
# `except` matching this `try:` lies outside the visible chunk).
start = time.time()
try:
    conn = utils.persist.connection()
    cur = conn.cursor()
    # `type` is compared to the literal string "None", not SQL NULL.
    sql = 'select description, position_name from company where type != "None"'
    cur.execute(sql)
    rst = cur.fetchall()
    transformer = TfidfTransformer()
    vectorizer = CountVectorizer()
    # Stopwords were pickled as a dict keyed by lowercase word (see the
    # has_key() membership tests below).  NOTE(review): file never closed.
    stopf = open('stopword', 'r')
    stopwords = pickle.load(stopf)
    train_data = []
    for rs in rst:
        # rs[0] = description, rs[1] = position_name.
        desc = utils.discrement_unicode(rs[0])
        segs = jieba.cut(desc, cut_all=False)
        wordss = []
        for seg in segs:
            if not stopwords.has_key(seg.lower()):
                wordss.append(seg.lower())
        # Position-name tokens are appended 5x to up-weight the job title
        # relative to the description in the bag of words.
        segs = jieba.cut(utils.discrement_unicode(rs[1]), cut_all=False)
        for seg in segs:
            if not stopwords.has_key(seg.lower()):
                for i in range(5):
                    wordss.append(seg.lower())
        train_data.append(' '.join(wordss))
    tfidf = transformer.fit_transform(vectorizer.fit_transform(train_data))
try: conn = utils.persist.connection() cur = conn.cursor() resume_id = '12227125-2' sql = 'select resumekeywords, dessalary, latestmajor, latestcollege,\ latestdegree, workyear, latestcompany, latesttitle from profile where \ resume_id = "%s"' % resume_id cur.execute(sql) rst = cur.fetchall() for rs in rst: for tem in rs: print utils.discrement_unicode(tem) sql = 'select unit_name, start_time, end_time, position_name, description from work where\ resume_id = "%s"' % resume_id cur.execute(sql) rst = cur.fetchall() for rs in rst: for tem in rs: print utils.discrement_unicode(tem) conn.close() except: traceback.print_exc() conn.close()
# Build one fixed-length count vector per row from the pickled topic-word
# template (fragment: `rst` and the open `topic_word` file come from
# earlier code; `x` is presumably consumed below -- not visible here).
topic_words = pickle.load(topic_word)
topicsx = []      # feature rows, one list of counts per record
topicsy = []      # numeric class label per record
type_dct = {}     # type name (lowercased) -> integer id
type_dct_r = {}   # integer id -> type name (reverse map)
num = 0
# First pass: assign a stable integer id to each distinct type (rs[1]).
for rs in rst:
    if not type_dct.has_key(rs[1].lower()):
        type_dct[rs[1].lower()] = num
        type_dct_r[num] = rs[1].lower()
        num += 1
# Second pass: count topic-word occurrences in each description (rs[0]).
for rs in rst:
    desc = rs[0]
    segs = jieba.cut(utils.discrement_unicode(desc), cut_all=False)
    # Fresh zeroed copy of the count template for every row.
    tpwords = copy.deepcopy(topic_words)
    for seg in segs:
        if tpwords.has_key(seg.lower()):
            tpwords[seg.lower()] += 1
    tpwordl = []
    topicsy.append(type_dct[rs[1].lower()])
    # Sorted key order keeps feature positions consistent across rows.
    for key in sorted(tpwords.keys()):
        tpwordl.append(tpwords[key])
    topicsx.append(tpwordl)
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
x = np.array(topicsx)
# Collect the vocabulary of non-stopword description tokens and pickle it
# to 'topic_word' (fragment: the `try:` matching the trailing `except`
# and the `sql` text lie outside this chunk).
    cur.execute(sql)
    stopf = open('stopword', 'rb')
    stopword = pickle.load(stopf)
    topics = []
    topic_words = {}   # token (lowercased) -> 0; used as a vocabulary set
    rst = cur.fetchall()
    for rs in rst:
        # Presumably rs = (description, position_name, type) -- TODO confirm.
        desc = rs[0]
        pname = rs[1]
        ptype = rs[2]
        topics.append(ptype)
        topic_word = []
        segs = jieba.cut(utils.discrement_unicode(desc))
        for seg in segs:
            # NOTE(review): stopword lookup here is case-sensitive, while
            # sibling code lowercases before the test -- possibly unintended.
            if stopword.has_key(seg):
                continue
            if not topic_words.has_key(seg.lower()):
                topic_words[seg.lower()] = 0
    pdb.set_trace()  # NOTE(review): debugger breakpoint left in
    # print topic_words
    for word in topic_words:
        print word
    # `topic_word` is rebound from list to file handle here; never closed.
    topic_word = open('topic_word', 'wb')
    pickle.dump(topic_words, topic_word)
except:
    traceback.print_exc()
# Parse the serialized project-list blob in rs[1] and insert one row per
# project into `projects` (fragment: `sql` and `cur` come from earlier code).
cur.execute(sql)
rst = cur.fetchall()
# pdb.set_trace()
for rs in rst:
    print rs[2]
    # Skip rows whose project blob is missing or the literal '[]'.
    if rs[1] == '[]' or not rs[1]:
        continue
    # rsff3 = rs[1].replace(u'\u201c', '"')
    # rsff2 = rsff3.replace('\n', '')
    # rsff3 = utils.discrement_unicode(rs[1])
    # pdb.set_trace()
    # Strip newlines and curly quotes that would break the eval() below.
    rsff3 = rs[1].replace('\n', '')
    rsff3 = rsff3.replace(u'\u201c', '"')
    rsff3 = rsff3.replace(u'\u2018', '')
    rsff3 = utils.discrement_unicode(rsff3)
    rsff2 = utils.convert_code(rsff3)
    # SECURITY: eval() executes arbitrary code from DB-stored text;
    # ast.literal_eval would be the safe replacement -- TODO confirm the
    # blob is always a plain Python literal.
    rsff = eval(rsff2)
    # pdb.set_trace()
    # Add unicode copies of each key/value (Python 2 str -> unicode);
    # the original byte-string keys are kept (pop is commented out).
    for rsf in rsff:
        for key in rsf.keys():
            rsf[key.decode('utf8')] = rsf[key].decode('utf8')
            # rsf.pop(key)
    for rsf in rsff:
        # NOTE(review): string-formatted INSERT breaks on embedded quotes
        # and is injectable; should be parameterized.
        sql = 'insert into projects(name, start, end, description, resume_id, software,\
 hardware, developtool, dudescription) values("%s", "%s", "%s", "%s", "%s", "%s",\
 "%s", "%s", "%s")' % (rsf.get('name', ''), rsf.get('start_time', ''), rsf.get('end_time', ''),
                       rsf.get(u'项目描述', ''), rs[0], rsf.get(u'软件环境', ''), rsf.get(u'硬件环境', ''),
                       rsf.get(u'开发工具', ''), rsf.get(u'责任描述', ''))
        cur.execute(sql)
rst = cur.fetchall() # topic_word = open('topic_word', 'r') topic_word = open('tfidfwords', 'r') topic_words = pickle.load(topic_word) topicsx = [] topicsy = [] type_dct = {} type_dct_r = {} train_data = [] train_tags = [] for rs in rst: desc = rs[0] segs = jieba.cut(utils.discrement_unicode(desc).lower(), cut_all=False) tpword = copy.deepcopy(topic_words) train_words = [] for seg in segs: if tpword.has_key(seg.lower()): tpword[seg.lower()] += 1 segs = jieba.cut(utils.discrement_unicode(rs[2]).lower(), cut_all=False) for seg in segs: if tpword.has_key(seg.lower()): tpword[seg.lower()] += 1 # pdb.set_trace() # for key in sorted(tpword.keys()): for key in tpword.keys(): train_words.append(tpword[key])
def generate_test_feature(cur, pos_id, resume_id):
    """Build one feature vector comparing position `pos_id` (companytest)
    against resume `resume_id` (profiletest/worktest).

    Returns the feature list for the FIRST profile row of the FIRST
    position row -- the `return` sits inside both loops.  NOTE(review):
    that early return looks suspicious but is documented as-is here.
    Raises NameError via the except path if the profile query fails
    (`profile` is then unbound) -- the handler only drops into pdb.
    """
    # NOTE(review): string-formatted SQL throughout this function breaks
    # on embedded quotes and is injectable; should be parameterized.
    sql = 'select position_id, low_income, description, low_workage, position_name,\
    workage, degree from companytest where position_id = %d' % pos_id
    cur.execute(sql)
    rst = cur.fetchall()
    for term in rst:
        # term indices: 0 position_id, 1 low_income, 2 description,
        # 3 low_workage, 4 position_name, 5 workage, 6 degree.
        com_low_income = term[1]
        pro_low_income = 0
        com_position = utils.discrement_unicode(term[4])
        com_workage = term[5]
        com_degree = term[6]
        pro_position = ''
        com_description = utils.discrement_unicode(term[2])
        pro_decription = ''
        pro_hisprojects = ''
        pro_otherinfo = ''
        com_lst = []
        # Values below 500 are treated as a yearly figure in other units
        # and converted to monthly -- TODO confirm the 8000/12 scaling.
        # Python 2 `/` on ints floors, so this buckets into 5000-RMB bands.
        if com_low_income < 500 and com_low_income > 0:
            com_low_income = (com_low_income * 8000) / 12
        com_low_income = com_low_income / 5000
        # NOTE(review): term[5] is `workage`; `low_workage` is term[3] in
        # the SELECT -- this may read the wrong column.
        low_workage = term[5]
        if not low_workage:
            low_workage = 0
        com_lst.append(low_workage)
        com_workage = term[5]
        com_degree = term[6]
        keywords = get_keywords(utils.discrement_unicode(term[2]))  # unused below
        com_description = utils.discrement_unicode(term[2])
        try:
            sqlp = 'select dessalary, skills, destitle, hisprojects, otherinfo, resume_id, workyear, latestdegree \
            from profiletest where resume_id = "%s" limit 5' % (resume_id)
            cur.execute(sqlp)
            profile = cur.fetchall()
        except:
            pdb.set_trace()  # NOTE(review): debugger breakpoint left in
        for pro in profile:
            # pro indices: 0 dessalary, 1 skills, 2 destitle,
            # 3 hisprojects, 4 otherinfo, 5 resume_id, 6 workyear,
            # 7 latestdegree.
            # Empty salary text falls back to '0' so the regex matches.
            if not pro[0]:
                incomes = salaryp.search('0')
            else:
                incomes = salaryp.search(pro[0])
            try:
                if not resume_id:
                    continue
                # Prefer the most recent job title from the work table.
                pos_sql = 'select position_name from worktest where resume_id = "%s" order by end_time desc' % resume_id
                cur.execute(pos_sql)
                pos_rst = cur.fetchall()
                pro_position = pos_rst[0][0]
            except:
                # Fall back to the profile's declared title.
                pro_position = pro[2]
            if incomes:
                low_income = incomes.group(0)
            else:
                low_income = 0
            pro_low_income = int(low_income) / 5000
            pro_hisprojects = utils.discrement_unicode(pro[3])
            pro_otherinfo = utils.discrement_unicode(pro[4])
            pro_skills = utils.discrement_unicode(pro[1])
            try:
                # try/except around a plain string format can never fire.
                sql_work = 'select description from worktest where resume_id = "%s"' % resume_id
            except:
                traceback.print_exc()
                pdb.set_trace()
            cur.execute(sql_work)
            pro_decription = cur.fetchall()
            position_feature = get_position_feature(com_position, pro_position)
            com_feature = []
            descrip_feature = get_description_feature(com_description, pro_hisprojects, \
            pro_decription, pro_otherinfo, pro_skills)
            com_feature.append(com_low_income)
            com_feature.append(pro_low_income)
            pro_workage = pro[6]
            workage = salaryp.search(pro_workage)
            if workage:
                pro_workage = int(workage.group(0))
            else:
                pro_workage = 0
            # Map degree text to an ordinal: bachelor-ish 1, master-ish 2,
            # anything else 0 (matched by CJK characters in the text).
            pro_degree = pro[7]
            if '科' in pro_degree or '学' in pro_degree:
                pro_degree = 1
            elif '硕' in pro_degree or '士' in pro_degree:
                pro_degree = 2
            else:
                pro_degree = 0
            com_feature.append(com_workage)
            com_feature.append(pro_workage - com_workage)
            com_feature.append(com_degree)
            com_feature.append(pro_degree - com_degree)
            com_feature.append(round(position_feature, 3))
            com_feature += descrip_feature
            return com_feature
def generate_test_feature(cur, pos_id, resume_id):
    """Duplicate of the sibling generate_test_feature definition; if both
    live in one module, this later definition shadows the earlier one.

    Builds one feature vector comparing position `pos_id` (companytest)
    against resume `resume_id` (profiletest/worktest) and returns it for
    the FIRST profile row of the FIRST position row (the `return` sits
    inside both loops -- NOTE(review): looks suspicious, documented as-is).
    """
    # NOTE(review): string-formatted SQL throughout breaks on embedded
    # quotes and is injectable; should be parameterized.
    sql = 'select position_id, low_income, description, low_workage, position_name,\
    workage, degree from companytest where position_id = %d' % pos_id
    cur.execute(sql)
    rst = cur.fetchall()
    for term in rst:
        # term indices: 0 position_id, 1 low_income, 2 description,
        # 3 low_workage, 4 position_name, 5 workage, 6 degree.
        com_low_income = term[1]
        pro_low_income = 0
        com_position = utils.discrement_unicode(term[4])
        com_workage = term[5]
        com_degree = term[6]
        pro_position = ''
        com_description = utils.discrement_unicode(term[2])
        pro_decription = ''
        pro_hisprojects = ''
        pro_otherinfo = ''
        com_lst = []
        # Sub-500 values are rescaled (TODO confirm 8000/12 semantics),
        # then floored into 5000-RMB bands (Python 2 integer division).
        if com_low_income < 500 and com_low_income > 0:
            com_low_income = (com_low_income * 8000) / 12
        com_low_income = com_low_income / 5000
        # NOTE(review): term[5] is `workage`; `low_workage` is term[3] --
        # possibly the wrong column.
        low_workage = term[5]
        if not low_workage:
            low_workage = 0
        com_lst.append(low_workage)
        com_workage = term[5]
        com_degree = term[6]
        keywords = get_keywords(
            utils.discrement_unicode(term[2]))  # unused below
        com_description = utils.discrement_unicode(term[2])
        try:
            sqlp = 'select dessalary, skills, destitle, hisprojects, otherinfo, resume_id, workyear, latestdegree \
            from profiletest where resume_id = "%s" limit 5' % (resume_id)
            cur.execute(sqlp)
            profile = cur.fetchall()
        except:
            pdb.set_trace()  # NOTE(review): debugger breakpoint left in
        for pro in profile:
            # pro indices: 0 dessalary, 1 skills, 2 destitle,
            # 3 hisprojects, 4 otherinfo, 5 resume_id, 6 workyear,
            # 7 latestdegree.
            if not pro[0]:
                incomes = salaryp.search('0')
            else:
                incomes = salaryp.search(pro[0])
            try:
                if not resume_id:
                    continue
                # Most recent job title from the work-history table.
                pos_sql = 'select position_name from worktest where resume_id = "%s" order by end_time desc' % resume_id
                cur.execute(pos_sql)
                pos_rst = cur.fetchall()
                pro_position = pos_rst[0][0]
            except:
                pro_position = pro[2]
            if incomes:
                low_income = incomes.group(0)
            else:
                low_income = 0
            pro_low_income = int(low_income) / 5000
            pro_hisprojects = utils.discrement_unicode(pro[3])
            pro_otherinfo = utils.discrement_unicode(pro[4])
            pro_skills = utils.discrement_unicode(pro[1])
            try:
                # try/except around a plain string format can never fire.
                sql_work = 'select description from worktest where resume_id = "%s"' % resume_id
            except:
                traceback.print_exc()
                pdb.set_trace()
            cur.execute(sql_work)
            pro_decription = cur.fetchall()
            position_feature = get_position_feature(com_position, pro_position)
            com_feature = []
            descrip_feature = get_description_feature(com_description, pro_hisprojects, \
            pro_decription, pro_otherinfo, pro_skills)
            com_feature.append(com_low_income)
            com_feature.append(pro_low_income)
            pro_workage = pro[6]
            workage = salaryp.search(pro_workage)
            if workage:
                pro_workage = int(workage.group(0))
            else:
                pro_workage = 0
            # Degree text -> ordinal: bachelor-ish 1, master-ish 2, else 0.
            pro_degree = pro[7]
            if '科' in pro_degree or '学' in pro_degree:
                pro_degree = 1
            elif '硕' in pro_degree or '士' in pro_degree:
                pro_degree = 2
            else:
                pro_degree = 0
            com_feature.append(com_workage)
            com_feature.append(pro_workage - com_workage)
            com_feature.append(com_degree)
            com_feature.append(pro_degree - com_degree)
            com_feature.append(round(position_feature, 3))
            com_feature += descrip_feature
            return com_feature
try: conn = utils.persist.connection() cur = conn.cursor() # sql = 'select low_income, high_income, low_workage, high_workage, description, \ # position_name, naren_created from company where id > %d' % 459 sql = "select id, description from company where id > %d " % 624 cur.execute(sql) rst = cur.fetchall() pdb.set_trace() for rs in rst: # print utils.discrement_unicode(rs[1]) print rs[0] try: usql = """update company set description = "%s" where id = %d""" % (utils.discrement_unicode(rs[1]), rs[0]) cur.execute(usql) except: pdb.set_trace() traceback.print_exc() conn.commit() conn.close() except: traceback.print_exc() pdb.set_trace() conn.close() lu = time.time() print lu - start
# Duplicate of the sibling topic-vector chunk: build one fixed-length
# count vector per row from the pickled topic-word template (fragment:
# `rst` and the open `topic_word` file come from earlier code).
topic_words = pickle.load(topic_word)
topicsx = []      # feature rows, one list of counts per record
topicsy = []      # numeric class label per record
type_dct = {}     # type name (lowercased) -> integer id
type_dct_r = {}   # integer id -> type name (reverse map)
num = 0
# First pass: assign a stable integer id to each distinct type (rs[1]).
for rs in rst:
    if not type_dct.has_key(rs[1].lower()):
        type_dct[rs[1].lower()] = num
        type_dct_r[num] = rs[1].lower()
        num += 1
# Second pass: count topic-word occurrences in each description (rs[0]).
for rs in rst:
    desc = rs[0]
    segs = jieba.cut(utils.discrement_unicode(desc), cut_all=False)
    # Fresh zeroed copy of the count template for every row.
    tpwords = copy.deepcopy(topic_words)
    for seg in segs:
        if tpwords.has_key(seg.lower()):
            tpwords[seg.lower()] += 1
    tpwordl = []
    topicsy.append(type_dct[rs[1].lower()])
    # Sorted key order keeps feature positions consistent across rows.
    for key in sorted(tpwords.keys()):
        tpwordl.append(tpwords[key])
    topicsx.append(tpwordl)
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
x = np.array(topicsx)
def get_feature(cur, feature_lines, flag, pos_id):
    """Append one CSV feature line per (position, candidate-profile) pair.

    cur           -- open DB cursor
    feature_lines -- output list; each element is a comma-joined row of
                     numeric features, ending with `flag` as the label
    flag          -- hr_confirm value; used both as a query filter and as
                     the trailing label column
    pos_id        -- restrict to one company position id; falsy means all

    NOTE(review): string-formatted SQL throughout breaks on embedded
    quotes and is injectable; should be parameterized.  If the profile
    query fails, `profile` is unbound and the loop below raises NameError.
    """
    if not pos_id:
        sql = 'select position_id, low_income, description, low_workage, position_name,\
        workage, degree from company'
    else:
        sql = 'select position_id, low_income, description, low_workage, position_name,\
        workage, degree from company where position_id = %d' % pos_id
    cur.execute(sql)
    rst = cur.fetchall()
    nummn = 0  # running count of profiles processed (progress printout)
    for term in rst:
        # term indices: 0 position_id, 1 low_income, 2 description,
        # 3 low_workage, 4 position_name, 5 workage, 6 degree.
        com_low_income = term[1]
        pro_low_income = 0
        com_position = utils.discrement_unicode(term[4])
        pro_position = ''
        com_description = utils.discrement_unicode(term[2])
        pro_decription = ''
        pro_hisprojects = ''
        pro_otherinfo = ''
        com_lst = []
        # Sub-500 values are rescaled (TODO confirm the 8000/12 semantics),
        # then floored into 5000-RMB bands (Python 2 integer division).
        if com_low_income < 500 and com_low_income > 0:
            com_low_income = (com_low_income * 8000) / 12
        com_low_income = com_low_income / 5000
        # NOTE(review): term[5] is `workage`; `low_workage` is term[3] in
        # the SELECT -- this may read the wrong column.
        low_workage = term[5]
        if not low_workage:
            low_workage = 0
        com_lst.append(low_workage)
        com_workage = term[5]
        com_degree = term[6]
        keywords = get_keywords(utils.discrement_unicode(term[2]))  # unused below
        com_description = utils.discrement_unicode(term[2])
        try:
            # Profiles paired with this position that are not yet used for
            # training (train_flag = 0) and carry the requested hr_confirm.
            sqlp = 'select dessalary, skills, latesttitle, hisprojects, otherinfo, pf.resume_id, workyear, latestdegree, \
            pr.pos_id, pr.resume_id from pos_resume as pr left join profile as pf on pr.resume_id = pf.resume_id \
            where pr.train_flag = 0 and pr.pos_id = %d and pr.hr_confirm = %d' % (term[0], flag)
            cur.execute(sqlp)
            profile = cur.fetchall()
        except:
            pdb.set_trace()  # NOTE(review): debugger breakpoint left in
        for pro in profile:
            # pro indices: 0 dessalary, 1 skills, 2 latesttitle,
            # 3 hisprojects, 4 otherinfo, 5 resume_id, 6 workyear,
            # 7 latestdegree, 8 pos_id, 9 resume_id (join side).
            nummn += 1
            print nummn
            # Empty salary text falls back to '0' so the regex matches.
            if not pro[0]:
                incomes = salaryp.search('0')
            else:
                incomes = salaryp.search(pro[0])
            try:
                resume_id = pro[5]
                if not resume_id:
                    continue
                # Prefer the most recent job title from the work table.
                pos_sql = 'select position_name from work where resume_id = %d order by end_time desc' % resume_id
                cur.execute(pos_sql)
                pos_rst = cur.fetchall()
                pro_position = utils.discrement_unicode(pos_rst[0][0])
            except:
                # Fall back to the profile's latest title.
                pro_position = utils.discrement_unicode(pro[2])
            if incomes:
                low_income = incomes.group(0)
            else:
                low_income = 0
            pro_low_income = int(low_income) / 5000
            pro_hisprojects = utils.discrement_unicode(pro[3])
            pro_otherinfo = utils.discrement_unicode(pro[4])
            pro_skills = utils.discrement_unicode(pro[1])
            try:
                # try/except around a plain string format can never fire.
                sql_work = 'select description from work where resume_id = "%s"' % resume_id
            except:
                traceback.print_exc()
                pdb.set_trace()
            cur.execute(sql_work)
            pro_decription = cur.fetchall()
            position_feature = get_position_feature(com_position, pro_position)
            com_feature = []
            descrip_feature = get_description_feature(com_description, pro_hisprojects, \
            pro_decription, pro_otherinfo, pro_skills)
            com_feature.append(com_low_income)
            com_feature.append(pro_low_income)
            pro_workage = pro[6]
            workage = salaryp.search(pro_workage)
            if workage:
                pro_workage = int(workage.group(0))
            else:
                pro_workage = 0
            # Degree text -> ordinal: bachelor-ish 1, master-ish 2, else 0
            # (matched by CJK characters in the degree string).
            pro_degree = utils.discrement_unicode(pro[7])
            if '科' in pro_degree or '学' in pro_degree:
                pro_degree = 1
            elif '硕' in pro_degree or '士' in pro_degree:
                pro_degree = 2
            else:
                pro_degree = 0
            com_feature.append(com_workage)
            com_feature.append(pro_workage - com_workage)
            com_feature.append(com_degree)
            com_feature.append(pro_degree - com_degree)
            com_feature.append(round(position_feature, 3))
            com_feature += descrip_feature
            # pdb.set_trace()
            # Extra work-history features appended, then `flag` as label.
            feature = utils.get_work_feature(cur, resume_id)
            com_feature = com_feature + feature
            com_feature.append(flag)
            feature_lines.append(','.join(map(lambda x: str(x), com_feature)))
rst = cur.fetchall() # topic_word = open('topic_word', 'r') topic_word = open('tfidfwords', 'r') topic_words = pickle.load(topic_word) topicsx = [] topicsy = [] type_dct = {} type_dct_r = {} train_data = [] train_tags = [] for rs in rst: desc = rs[0] segs = jieba.cut(utils.discrement_unicode(desc).lower(), cut_all=False) tpword = copy.deepcopy(topic_words) train_words = [] for seg in segs: if tpword.has_key(seg.lower()): tpword[seg.lower()] += 1 segs = jieba.cut(utils.discrement_unicode(rs[2]).lower(), cut_all=False) for seg in segs: if tpword.has_key(seg.lower()): tpword[seg.lower()] += 1 # pdb.set_trace() # for key in sorted(tpword.keys()): for key in tpword.keys(): train_words.append(tpword[key])
# Duplicate of the sibling TF-IDF training chunk (fragment: the `except`
# matching this `try:` lies outside the visible chunk).
start = time.time()
try:
    conn = utils.persist.connection()
    cur = conn.cursor()
    # `type` is compared to the literal string "None", not SQL NULL.
    sql = 'select description, position_name from company where type != "None"'
    cur.execute(sql)
    rst = cur.fetchall()
    transformer = TfidfTransformer()
    vectorizer = CountVectorizer()
    # Stopwords were pickled as a dict keyed by lowercase word (see the
    # has_key() membership tests below).  NOTE(review): file never closed.
    stopf = open('stopword', 'r')
    stopwords = pickle.load(stopf)
    train_data = []
    for rs in rst:
        # rs[0] = description, rs[1] = position_name.
        desc = utils.discrement_unicode(rs[0])
        segs = jieba.cut(desc, cut_all=False)
        wordss = []
        for seg in segs:
            if not stopwords.has_key(seg.lower()):
                wordss.append(seg.lower())
        # Position-name tokens are appended 5x to up-weight the job title
        # relative to the description in the bag of words.
        segs = jieba.cut(utils.discrement_unicode(rs[1]), cut_all=False)
        for seg in segs:
            if not stopwords.has_key(seg.lower()):
                for i in range(5):
                    wordss.append(seg.lower())
        train_data.append(' '.join(wordss))
    tfidf = transformer.fit_transform(vectorizer.fit_transform(train_data))
try: conn = utils.persist.connection() cur = conn.cursor() # sql = 'select low_income, high_income, low_workage, high_workage, description, \ # position_name, naren_created from company where id > %d' % 459 sql = 'select id, description from company where id > %d ' % 624 cur.execute(sql) rst = cur.fetchall() pdb.set_trace() for rs in rst: # print utils.discrement_unicode(rs[1]) print rs[0] try: usql = '''update company set description = "%s" where id = %d''' % ( utils.discrement_unicode(rs[1]), rs[0]) cur.execute(usql) except: pdb.set_trace() traceback.print_exc() conn.commit() conn.close() except: traceback.print_exc() pdb.set_trace() conn.close() lu = time.time() print lu - start
# Train a Multinomial Naive Bayes classifier on rows with a known type,
# then fetch the untyped rows for prediction (fragment: `cur` comes from
# earlier code; the final `for` loop continues past this chunk).
sql = 'select description, type from company where type != "None"'
cur.execute(sql)
rst = cur.fetchall()
topic_word = open('topic_word', 'r')  # NOTE(review): never closed here
topic_words = pickle.load(topic_word)
topicsx = []
topicsy = []
type_dct = {}
type_dct_r = {}
train_words = []   # raw description strings (vectorized below)
train_tags = []    # the `type` column, used directly as class labels
for rs in rst:
    desc = rs[0]
    train_words.append(utils.discrement_unicode(desc))
    train_tags.append(utils.discrement_unicode(rs[1]))
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
train_data = vectorize(train_words)
clf = MultinomialNB(alpha=0.01)
clf.fit(train_data, train_tags)
# Rows still typed as the literal string "None" become the test set.
sqll = 'select description, position_name, id from company where type = "None"'
cur.execute(sqll)
rst = cur.fetchall()
test_words = []
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
for rs in rst:
def get_feature(cur, feature_lines, flag):
    """Append one CSV feature line per (position, candidate-profile) pair
    and mark each consumed pair as used for training.

    Variant of the sibling 4-argument get_feature: no pos_id filter, the
    profile query is capped at `limit 5`, no utils.get_work_feature
    extras, and set_train_flg() is called after each emitted row.

    cur           -- open DB cursor
    feature_lines -- output list of comma-joined feature rows; each ends
                     with `flag` as the label column
    flag          -- hr_confirm value (query filter and label)

    NOTE(review): string-formatted SQL throughout breaks on embedded
    quotes and is injectable; should be parameterized.  If the profile
    query fails, `profile` is unbound and the loop below raises NameError.
    """
    sql = 'select position_id, low_income, description, low_workage, position_name,\
    workage, degree from company'
    cur.execute(sql)
    rst = cur.fetchall()
    nummn = 0  # running count of profiles processed (progress printout)
    for term in rst:
        # term indices: 0 position_id, 1 low_income, 2 description,
        # 3 low_workage, 4 position_name, 5 workage, 6 degree.
        com_low_income = term[1]
        pro_low_income = 0
        com_position = utils.discrement_unicode(term[4])
        pro_position = ''
        com_description = utils.discrement_unicode(term[2])
        pro_decription = ''
        pro_hisprojects = ''
        pro_otherinfo = ''
        com_lst = []
        # Sub-500 values are rescaled (TODO confirm the 8000/12 semantics),
        # then floored into 5000-RMB bands (Python 2 integer division).
        if com_low_income < 500 and com_low_income > 0:
            com_low_income = (com_low_income * 8000) / 12
        com_low_income = com_low_income / 5000
        # NOTE(review): term[5] is `workage`; `low_workage` is term[3] in
        # the SELECT -- this may read the wrong column.
        low_workage = term[5]
        if not low_workage:
            low_workage = 0
        com_lst.append(low_workage)
        com_workage = term[5]
        com_degree = term[6]
        keywords = get_keywords(utils.discrement_unicode(term[2]))  # unused below
        com_description = utils.discrement_unicode(term[2])
        try:
            # At most 5 not-yet-trained profiles paired with this position
            # carrying the requested hr_confirm value.
            sqlp = 'select dessalary, skills, latesttitle, hisprojects, otherinfo, pf.resume_id, workyear, latestdegree, \
            pr.pos_id, pr.resume_id from pos_resume as pr left join profile as pf on pr.resume_id = pf.resume_id \
            where pr.train_flag = 0 and pr.pos_id = %d and pr.hr_confirm = %d limit 5' % (
                term[0], flag)
            cur.execute(sqlp)
            profile = cur.fetchall()
        except:
            pdb.set_trace()  # NOTE(review): debugger breakpoint left in
        for pro in profile:
            # pro indices: 0 dessalary, 1 skills, 2 latesttitle,
            # 3 hisprojects, 4 otherinfo, 5 resume_id, 6 workyear,
            # 7 latestdegree, 8 pos_id, 9 resume_id (join side).
            nummn += 1
            print nummn
            # Empty salary text falls back to '0' so the regex matches.
            if not pro[0]:
                incomes = salaryp.search('0')
            else:
                incomes = salaryp.search(pro[0])
            try:
                resume_id = pro[5]
                if not resume_id:
                    continue
                # Prefer the most recent job title from the work table.
                pos_sql = 'select position_name from work where resume_id = %d order by end_time desc' % resume_id
                cur.execute(pos_sql)
                pos_rst = cur.fetchall()
                pro_position = utils.discrement_unicode(pos_rst[0][0])
            except:
                # Fall back to the profile's latest title.
                pro_position = utils.discrement_unicode(pro[2])
            if incomes:
                low_income = incomes.group(0)
            else:
                low_income = 0
            pro_low_income = int(low_income) / 5000
            pro_hisprojects = utils.discrement_unicode(pro[3])
            pro_otherinfo = utils.discrement_unicode(pro[4])
            pro_skills = utils.discrement_unicode(pro[1])
            try:
                # try/except around a plain string format can never fire.
                sql_work = 'select description from work where resume_id = "%s"' % resume_id
            except:
                traceback.print_exc()
                pdb.set_trace()
            cur.execute(sql_work)
            pro_decription = cur.fetchall()
            position_feature = get_position_feature(com_position, pro_position)
            com_feature = []
            descrip_feature = get_description_feature(com_description, pro_hisprojects, \
            pro_decription, pro_otherinfo, pro_skills)
            com_feature.append(com_low_income)
            com_feature.append(pro_low_income)
            pro_workage = pro[6]
            workage = salaryp.search(pro_workage)
            if workage:
                pro_workage = int(workage.group(0))
            else:
                pro_workage = 0
            # Degree text -> ordinal: bachelor-ish 1, master-ish 2, else 0
            # (matched by CJK characters in the degree string).
            pro_degree = utils.discrement_unicode(pro[7])
            if '科' in pro_degree or '学' in pro_degree:
                pro_degree = 1
            elif '硕' in pro_degree or '士' in pro_degree:
                pro_degree = 2
            else:
                pro_degree = 0
            com_feature.append(com_workage)
            com_feature.append(pro_workage - com_workage)
            com_feature.append(com_degree)
            com_feature.append(pro_degree - com_degree)
            com_feature.append(round(position_feature, 3))
            com_feature += descrip_feature
            com_feature.append(flag)
            feature_lines.append(','.join(map(lambda x: str(x), com_feature)))
            # Mark this (position, resume) pair as consumed for training.
            set_train_flg(cur, term[0], resume_id)
# Duplicate of the sibling Naive-Bayes chunk: train MultinomialNB on rows
# with a known type, then fetch untyped rows for prediction (fragment:
# `cur` comes from earlier code; the final `for` continues past this chunk).
sql = 'select description, type from company where type != "None"'
cur.execute(sql)
rst = cur.fetchall()
topic_word = open('topic_word', 'r')  # NOTE(review): never closed here
topic_words = pickle.load(topic_word)
topicsx = []
topicsy = []
type_dct = {}
type_dct_r = {}
train_words = []   # raw description strings (vectorized below)
train_tags = []    # the `type` column, used directly as class labels
for rs in rst:
    desc = rs[0]
    train_words.append(utils.discrement_unicode(desc))
    train_tags.append(utils.discrement_unicode(rs[1]))
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
train_data = vectorize(train_words)
clf = MultinomialNB(alpha=0.01)
clf.fit(train_data, train_tags)
# Rows still typed as the literal string "None" become the test set.
sqll = 'select description, position_name, id from company where type = "None"'
cur.execute(sqll)
rst = cur.fetchall()
test_words = []
pdb.set_trace()  # NOTE(review): debugger breakpoint left in
for rs in rst:
# Duplicate of the sibling project-insert chunk: parse the serialized
# project-list blob in rs[1] and insert one row per project into
# `projects` (fragment: `sql` and `cur` come from earlier code).
cur.execute(sql)
rst = cur.fetchall()
# pdb.set_trace()
for rs in rst:
    print rs[2]
    # Skip rows whose project blob is missing or the literal '[]'.
    if rs[1] == '[]' or not rs[1]:
        continue
    # rsff3 = rs[1].replace(u'\u201c', '"')
    # rsff2 = rsff3.replace('\n', '')
    # rsff3 = utils.discrement_unicode(rs[1])
    # pdb.set_trace()
    # Strip newlines and curly quotes that would break the eval() below.
    rsff3 = rs[1].replace('\n', '')
    rsff3 = rsff3.replace(u'\u201c', '"')
    rsff3 = rsff3.replace(u'\u2018', '')
    rsff3 = utils.discrement_unicode(rsff3)
    rsff2 = utils.convert_code(rsff3)
    # SECURITY: eval() executes arbitrary code from DB-stored text;
    # ast.literal_eval would be the safe replacement -- TODO confirm the
    # blob is always a plain Python literal.
    rsff = eval(rsff2)
    # pdb.set_trace()
    # Add unicode copies of each key/value (Python 2 str -> unicode);
    # the original byte-string keys are kept (pop is commented out).
    for rsf in rsff:
        for key in rsf.keys():
            rsf[key.decode('utf8')] = rsf[key].decode('utf8')
            # rsf.pop(key)
    for rsf in rsff:
        # NOTE(review): string-formatted INSERT breaks on embedded quotes
        # and is injectable; should be parameterized.
        sql = 'insert into projects(name, start, end, description, resume_id, software,\
 hardware, developtool, dudescription) values("%s", "%s", "%s", "%s", "%s", "%s",\
 "%s", "%s", "%s")' % (rsf.get('name', ''), rsf.get('start_time', ''), rsf.get('end_time', ''),
                       rsf.get(u'项目描述', ''), rs[0], rsf.get(u'软件环境', ''), rsf.get(u'硬件环境', ''),
                       rsf.get(u'开发工具', ''), rsf.get(u'责任描述', ''))
        cur.execute(sql)