Example #1
def update_status_db(et_info_url):
    # Mark the enterprise as processed: url_status=3 if a website was found, 2 otherwise.
    status = 3 if et_info_url['etwebsite'] else 2
    print(utils.current_time(), 'Updating status table, url_status={}...'.format(status))
    conn = utils.get_local_db()
    et_status = {'etid': et_info_url['etid'], 'url_status': status}
    update_db(conn, et_status, 'et_info_status')
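The update_db helper called above is not shown in these examples. A minimal sketch of what it could look like, assuming a PyMySQL-style connection from utils.get_local_db and that et_info_status is keyed on etid (both are assumptions, not confirmed by the source):

def update_db(conn, row, table):
    # Hypothetical helper: update every non-key column of `row` for the matching etid.
    cursor = conn.cursor()
    assignments = ', '.join('{}=%s'.format(col) for col in row if col != 'etid')
    values = [row[col] for col in row if col != 'etid']
    cursor.execute('update {} set {} where etid=%s'.format(table, assignments),
                   values + [row['etid']])
    conn.commit()
    cursor.close()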
Example #2
def get_lt_etid():
    logging.info('12. Fetching the other enterprises')
    print(utils.current_time(), 'Opening ODPS connection...')
    o = ODPS('LTAIzEuNzcL6qJJ8', 'eUAgj9ijhWCvOQ3w5Uv3FkwhNxvPF2',
             'database_test', 'http://service.odps.aliyun.com/api')
    print(utils.current_time(), 'Running query...')
    pt = time.strftime('%Y%m%d', time.localtime(int(time.time() - 86400)))
    res = o.execute_sql(
        "select distinct etid from et_jobs where pt='{}' and isheadhunter=1".
        format(pt))
    print(utils.current_time(), 'Processing query results...')
    etid_set = set()
    conn = utils.get_local_db()
    addtime = int(time.time())
    cnt = 0
    with res.open_reader() as reader:
        total = reader.count
        print(utils.current_time(), '{} rows to process in total!'.format(total))
        for record in reader:
            etid_set.add((record['etid'], ))
            if len(etid_set) >= 1000:
                conn.executemany(
                    "insert into et_info_status(etid,addtime) values(%s,{})on duplicate key update etid=values(etid), addtime=values(addtime)"
                    .format(addtime), list(etid_set))
                cnt += 1000
                print(utils.current_time(), '{} rows written so far!'.format(cnt))
                etid_set.clear()
    if len(etid_set) > 0:
        conn.executemany(
            "insert into et_info_status(etid,addtime) values(%s,{})on duplicate key update etid=values(etid), addtime=values(addtime)"
            .format(addtime), list(etid_set))
        cnt += len(etid_set)
        print(utils.current_time(), '{} rows written so far!'.format(cnt))
    conn.close()
    return total
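The batched upsert in get_lt_etid is the core pattern here: accumulate rows, then flush every 1000 with insert ... on duplicate key update. A self-contained sketch of that pattern in isolation, assuming pymysql for the connection (utils.get_local_db is not shown) and passing addtime as a parameter instead of interpolating it with format():

import time
import pymysql

def upsert_etids(etids, batch_size=1000):
    # placeholder connection settings; the real values live in utils.get_local_db
    conn = pymysql.connect(host='127.0.0.1', user='root', password='', db='test')
    addtime = int(time.time())
    rows = [(etid, addtime) for etid in etids]
    with conn.cursor() as cursor:
        for i in range(0, len(rows), batch_size):
            cursor.executemany(
                "insert into et_info_status(etid,addtime) values(%s,%s) "
                "on duplicate key update etid=values(etid), addtime=values(addtime)",
                rows[i:i + batch_size])
    conn.commit()
    conn.close()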
Example #3
def get_36kr_etid():
    logging.info('10. Fetching 36kr enterprises')
    print(utils.current_time(), 'Opening database connection...')
    conn = utils.get_read_db(db='contact_datastore')
    print(utils.current_time(), 'Querying the etids to collect...')
    res = conn.query("select etid from dt_daily_36kr")
    conn.close()
    print(utils.current_time(), 'Query finished!')
    insert_list = []
    addtime = int(time.time())
    for x in res:
        insert_list.append([x['etid'], addtime])
    print(utils.current_time(), 'Preparing to write to the database...')
    conn = utils.get_local_db()
    total = len(insert_list)
    print(utils.current_time(), 'Rows to write in total:', total)
    for i in range(0, total, 1000):
        start = i
        end = min(start + 1000, total)
        conn.executemany(
            "insert into et_info_status(etid,addtime) values(%s,%s)on duplicate key update etid=values(etid), addtime=values(addtime)",
            insert_list[start:end])
        print(utils.current_time(), 'Written {}/{} so far!'.format(end, total))
    conn.close()
    print('Writing finished!')
    return total
Example #4
def extract_all(use_random_forest):
    if use_random_forest:
        emails = rf_model()
        emails = [email for email in emails if email[0] != 'negatives_clean']
    else:
        # collect labelled [collection, text] pairs from the local MongoDB
        emails = []
        db = utils.get_local_db()
        for collection in db.collection_names():
            if collection != 'negatives_clean':
                for record in db.get_collection(collection).find():
                    emails.append([collection] + [record['Text']])

    # find features for each email
    email_data = []
    for email_set in emails:
        email = email_set[1]
        fields = features[email_set[0]]

        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []

        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])

        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        email_data.append([email_set[0], email, matches])
    return email_data
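extract_all depends on the module-level features mapping and the describe helper, neither of which appears in these examples. A self-contained sketch of the underlying idea, scoring each named entity against a field name directly with WordNet path similarity (it assumes the usual NLTK data is downloaded: punkt, averaged_perceptron_tagger, maxent_ne_chunker, words, wordnet):

import nltk
from nltk.corpus import wordnet as wn

def match_entities_to_field(sentence, field_name):
    # extract named entities, then score each one against the field name
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence)))
    scored = []
    for branch in tree:
        if isinstance(branch, nltk.tree.Tree):
            entity = ' '.join(leaf[0] for leaf in branch)
            dist = 0
            for word in entity.split():
                a = wn.synsets(field_name)
                b = wn.synsets(word)
                if a and b:
                    sim = a[0].path_similarity(b[0])
                    if sim:
                        dist += sim
            scored.append((dist, entity))
    return sorted(scored, reverse=True)

# e.g. match_entities_to_field('Alice Smith works at Acme Corporation in Boston.', 'person')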
Example #5
def get_etid():
    """
    获取未处理的etid
    :return:
    """
    logging.info('%s 本地表中读取所有未处理的etid...' % utils.current_time())
    conn = utils.get_local_db()
    result = conn.query(
        "select etid from et_info_status where url_status=1 limit 200")
    conn.close()
    return result
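Read together with Example #1, these helpers imply a simple url_status lifecycle in et_info_status: 1 for rows still to be processed, 3 once a website is found, 2 when none is. A hedged sketch of a polling loop built on them, where process_url is a hypothetical downstream step (not part of these examples) that looks up the website and returns a dict with etid and etwebsite:

def process_pending():
    while True:
        pending = get_etid()                        # rows with url_status=1, at most 200 per round
        if not pending:
            break
        for row in pending:
            et_info_url = process_url(row['etid'])  # hypothetical lookup step
            update_status_db(et_info_url)           # marks the row as 3 (website) or 2 (no website)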
Example #6
def rf_model():
    percent_training = .70  # proportion of data to use for training

    # get emails from the local MongoDB
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # shuffle and split emails
    random.shuffle(emails)
    training_set = emails[:int(percent_training * len(emails))]
    testing_set = emails[int(percent_training * len(emails)):]
    training_labels = [row[0] for row in training_set]
    training_data = [row[1] for row in training_set]
    testing_data = [row[1] for row in testing_set]

    # tf-idf vectorize training set
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(training_data)
    X = X.toarray()

    # tf-idf vectorize testing set
    vectorized_testing_data = [
        vectorizer.transform([email]) for email in testing_data
    ]
    total = len(vectorized_testing_data)

    # create random forest
    forest = RandomForestClassifier(n_estimators=int(sqrt(len(X[0]))) + 1)
    forest.fit(X, training_labels)

    # generate and return predictions
    tagged_emails = []
    for i in range(total):
        tagged_emails.append(
            [forest.predict(vectorized_testing_data[i])[0], testing_data[i]])

    return tagged_emails
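A short usage sketch for rf_model, assuming the MongoDB behind utils.get_local_db holds one collection per email category; each returned pair is [predicted_category, email_text] for a held-out email. Fitting the TfidfVectorizer only on the training split, as above, keeps the test emails out of the tf-idf vocabulary.

tagged = rf_model()
for predicted_category, email_text in tagged[:5]:
    print(predicted_category, '->', email_text[:80])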
Example #7
def rf_categorize(email):
    # get training corpus
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # vectorize corpus
    labels = [row[0] for row in emails]
    data = [row[1] for row in emails]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    X = X.toarray()

    # vectorize input
    email_vector = vectorizer.transform([email])

    # create random forest and return prediction
    forest = RandomForestClassifier(n_estimators=int(sqrt(len(X[0]))) + 1)
    forest.fit(X, labels)
    return forest.predict(email_vector)[0]
import utils
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from math import sqrt

if __name__ == '__main__':
    num_samples = 100  # number of cross-validation folds (one forest is trained and scored per fold)

    # get emails from local mongodb
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # create labels and vectorize data
    labels = [row[0] for row in emails]
    vectorizer = TfidfVectorizer()
    data = vectorizer.fit_transform([row[1] for row in emails]).toarray()

    # create random forest and perform cross validation
    forest = RandomForestClassifier(n_estimators=int(sqrt(len(data[0]))) + 1)
    scores = cross_val_score(forest, data, labels, cv=num_samples)

    # write output to file
    output = open('random_forest_cross_validation.txt', 'w')
    for i in range(len(scores)):
        print(str(i) + ": " + str(scores[i]), file=output)
    output.close()
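The comment above mentions averaging, but the script only writes the raw per-fold scores to a file. A self-contained sketch of the same pattern on synthetic data (not the project's emails), using the sklearn.model_selection API with an explicit mean and standard deviation:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_classification(n_samples=200, n_features=64, random_state=0)
demo_forest = RandomForestClassifier(n_estimators=int(64 ** 0.5) + 1)
demo_scores = cross_val_score(demo_forest, X_demo, y_demo, cv=5)
print('mean accuracy: {:.3f} (std {:.3f})'.format(demo_scores.mean(), demo_scores.std()))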