def parse_collaboration_info(info):
    # Count how many times each collaborating affiliation id appears in
    # the comma-separated id string.
    collaborators = info.split(',')
    pub = {}
    for col in collaborators:
        if len(col) > 0:
            pub[col] = pub.get(col, 0) + 1

    # Keep the six affiliations with the highest collaboration counts.
    res = sorted(([k, v] for k, v in pub.items()),
                 key=lambda x: x[1],
                 reverse=True)
    res = res[:6]

    for i in range(len(res)):
        affid = res[i][0]
        try:
            Cursor.execute(
                'select name from affiliation where id = %s;', (affid, ))
            res[i] = {
                "affiliation_id": affid,
                "affiliation_name": Cursor.fetchone()[0],
                "collaboration_count": res[i][1]
            }
        except Exception as e:
            traceback.print_exc()
            print(e)
    return res
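
Several of the jobs below iterate with a chunks helper that is not included in this listing; a minimal sketch of what it presumably does:

def chunks(lst, n):
    # Yield successive n-sized slices of lst (assumed helper, not part
    # of the original listing).
    for i in range(0, len(lst), n):
        yield lst[i:i + n]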
# Example 2
    def _load_affiliation_ids(self):
        # Cache every affiliation id for the chunked queries below.
        sql = '''SELECT id FROM affiliation'''
        Cursor.execute(sql)
        raw_result = list(Cursor.fetchall())
        self.affiliation_ids = [i[0] for i in raw_result]
        print("{} affiliation_total_count: {}".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            len(self.affiliation_ids)))
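
Every snippet assumes module-level Cursor, Connection, Neo4jDriver and RedisTemplate handles. A hypothetical setup, assuming pymysql, the official neo4j driver, and redis-py; hosts and credentials are placeholders:

import pymysql
import redis
from neo4j import GraphDatabase

# Placeholder connection details; the real configuration is not shown
# in the original source.
Connection = pymysql.connect(host='localhost', user='root',
                             password='...', db='acmieee')
Cursor = Connection.cursor()
Neo4jDriver = GraphDatabase.driver('bolt://localhost:7687',
                                   auth=('neo4j', '...'))
RedisTemplate = redis.StrictRedis(host='localhost', port=6379, db=0)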
def update_author_collaboration_job():
    sql = 'SELECT id FROM author'
    Cursor.execute(sql)
    author_list = [row[0] for row in Cursor.fetchall()]
    update_sql = '''
        INSERT INTO author_collaboration(start_id,end_id,distance,predict_collaboration)
        VALUES (%s,%s,%s,%s)
    '''
    for authors in chunks(author_list, 500):
        start_time = time.time()
        # Collect one chunk's rows, then insert them in a single batch.
        author_collaboration_list = []
        for author in authors:
            with Neo4jDriver.session() as session:
                res = session.read_transaction(searchCoAuthor, author)
                for coAuthor in res:
                    jaccard_distance = computeJaccrdDistance(
                        author, coAuthor["authorId"])
                    row = (author, coAuthor["authorId"],
                           round(jaccard_distance[0], 2),
                           json.dumps(jaccard_distance[1]))
                    print(row)
                    author_collaboration_list.append(row)
        try:
            Cursor.executemany(update_sql, author_collaboration_list)
            Connection.commit()
        except Exception as e:
            print(e)
            Connection.rollback()
        end_time = time.time()
        duration = end_time - start_time
        print('update_author_collaboration_job 500 runtime is:{0:.3f}s'.format(
            duration))
        time.sleep(1)
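
searchCoAuthor is passed to session.read_transaction above but is not included in this listing. A hypothetical shape, assuming an Author label and a COLLABORATE relationship in the graph (both names are guesses):

def searchCoAuthor(tx, author_id):
    # Return the co-authors of the given author; label and relationship
    # names are assumptions, not taken from the original source.
    query = ("MATCH (a:Author {id: $id})-[:COLLABORATE]-(co:Author) "
             "RETURN co.id AS authorId")
    return list(tx.run(query, id=author_id))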
# Example 4
def update_one_affiliation_year_count(id, pipe):
    # Cache how many articles this affiliation published in each year.
    try:
        Cursor.execute('''
            select
                group_concat(concat(year(art.date)) separator ",")
            from article art, affiliation_article afar
                where afar.article_id = art.id and afar.affiliation_id = %s;
        ''', (id, ))
        raw_result = Cursor.fetchone()[0]
        if raw_result:
            raw_result = raw_result.strip()
        res = json.dumps(parseInfo(raw_result))
        pipe.set(cache_const.AFFILIATION_YEAR_COUNT.format(id), res)
    except Exception as e:
        traceback.print_exc()
        print(e)
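
parseInfo is not shown either; since the query returns a comma-separated list of publication years, one plausible sketch counts publications per year, mirroring parse_collaboration_info above:

def parseInfo(info):
    # Count occurrences of each year in a comma-separated string
    # (assumed behaviour; the original helper is not in this listing).
    counts = {}
    if info:
        for year in info.split(','):
            if year:
                counts[year] = counts.get(year, 0) + 1
    return counts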
def update_one_affiliation_collaboration(id, pipe):
    # Collect the ids of every affiliation that shares an article with
    # this one, then cache the six most frequent collaborators.
    sql = '''
        select group_concat(aff2.affiliation_id)
        from affiliation_article aff1, affiliation_article aff2
        where aff1.affiliation_id <> aff2.affiliation_id
            and aff1.article_id = aff2.article_id
            and aff1.affiliation_id = %s
    '''

    try:
        Cursor.execute(sql, (id, ))
        inf = Cursor.fetchone()[0]
        if inf and len(inf) > 0:
            res = parse_collaboration_info(inf)
            pipe.set(AFFILIATION_COLLABORATION_PUBLICATION_COUNT.format(id),
                     json.dumps(res))
    except Exception as e:
        traceback.print_exc()
        print(e)
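
A hypothetical driver for these two per-affiliation updaters, batching the cache writes through one Redis pipeline (the loader variable and flush strategy are assumptions):

loader = AffiliationLoader()
loader.get_affiliation_data()
pipe = RedisTemplate.pipeline()
for aff_id in loader.affiliation_ids:
    update_one_affiliation_year_count(str(aff_id), pipe)
    update_one_affiliation_collaboration(str(aff_id), pipe)
pipe.execute()  # flush all queued SETs in one round trip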
# Example 6
def update_affiliation_keyword_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0],
                                  reverse=False)
    related_keyword_dict = {}
    sql = '''SELECT keyword_id,keyword_desc,COUNT(article_id)AS num FROM keyword_article
        WHERE article_id IN %s
        GROUP BY keyword_id,keyword_desc
        ORDER BY num DESC'''
    for affiliations_articles in chunks(related_article_list, 500):
        related_dict = {}
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]

            # Skip affiliations that have no associated articles.
            if not articles or len(articles) == 0:
                continue

            Cursor.execute(sql, (articles, ))
            raw_result = list(Cursor.fetchall())
            if not raw_result:
                continue
            keywords = list(map(parseKeyword, raw_result))
            related_dict[affiliation_id] = keywords
        related_keyword_dict.update(related_dict)
    print("{} related_keyword_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(related_keyword_dict)))
    pipeline = RedisTemplate.pipeline()
    for affiliation_entries in chunks(related_article_list, 500):
        for entry in affiliation_entries:
            affiliation_key = AFFILIATION_RELATED_KEYWORD_KEY_TEMPLATE.format(
                entry[0])
            keywords = related_keyword_dict.get(entry[0])
            if keywords:
                pipeline.set(affiliation_key, json.dumps(keywords))
        pipeline.execute()
        time.sleep(1)
    print("{} update_affiliation_keyword_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
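
parseKeyword, applied to each (keyword_id, keyword_desc, num) result row above, is also missing from the listing; a plausible sketch:

def parseKeyword(row):
    # Map a (keyword_id, keyword_desc, num) row to a dict (assumed
    # shape; the original helper is not shown).
    return {
        "keyword_id": row[0],
        "keyword_desc": row[1],
        "article_count": int(row[2]),
    }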
def update_affiliation_new_article_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0],
                                  reverse=False)
    related_new_article_dict = {}
    sql = '''SELECT id FROM article
            WHERE id IN %s
            ORDER BY date DESC LIMIT 1'''
    for affiliations_articles in chunks(related_article_list, 500):
        related_dict = {}
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]

            # Skip affiliations that have no associated articles.
            if not articles or len(articles) == 0:
                continue

            Cursor.execute(sql, (articles, ))
            raw_result = Cursor.fetchone()
            if raw_result is None:
                continue
            article_id = raw_result[0]
            related_dict[affiliation_id] = article_id
        related_new_article_dict.update(related_dict)
    print("{} related_new_article_dict_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(related_new_article_dict)))
    pipeline = RedisTemplate.pipeline()
    for affiliation_entries in chunks(related_article_list, 500):
        for entry in affiliation_entries:
            affiliation_key = AFFILIATION_RELATED_NEW_ARTICLE_ID_KEY_TEMPLATE.format(
                entry[0])
            new_article_id = related_new_article_dict.get(entry[0])
            if new_article_id:
                pipeline.set(affiliation_key, json.dumps(new_article_id))
        pipeline.execute()
        time.sleep(1)
    print("{} update_affiliation_new_article_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
# Example 8
    def _load_related_article_dict(self):
        # Map each affiliation id to the list of its article ids.
        sql = '''
                SELECT affiliation_id, group_concat(article_id) as article_ids
                FROM affiliation_article
                WHERE affiliation_id IN %s
                GROUP BY affiliation_id
            '''
        for ids in chunks(self.affiliation_ids, 500):
            Cursor.execute(sql, (ids, ))
            raw_result = list(Cursor.fetchall())
            related_dict = {}
            for info in raw_result:
                if info is None:
                    continue
                if info[1] is None or len(info[1]) == 0:
                    related_dict[info[0]] = []
                else:
                    related_dict[info[0]] = info[1].split(',')
            self.related_article_dict.update(related_dict)
            time.sleep(1)
        print("{} related_article_dict_len: {}".format(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            len(self.related_article_dict)))
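
From the two private loaders shown, AffiliationLoader presumably looks roughly like this (a reconstruction, not the original class):

class AffiliationLoader:
    def __init__(self):
        self.affiliation_ids = []
        self.related_article_dict = {}

    def get_affiliation_data(self):
        # Load all affiliation ids first, then their article lists.
        self._load_affiliation_ids()
        self._load_related_article_dict()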
def computeAffiliationDistance(start, end):
    # Jaccard distance over the affiliation sets of two authors:
    # 1 - |shared affiliations| / |affiliations of either author|.
    intersection_sql = '''
    SELECT COUNT(DISTINCT a1.affiliation_id) FROM acmieee.affiliation_author a1,acmieee.affiliation_author a2
    WHERE a1.author_id = %s AND a2.author_id = %s AND a1.affiliation_id = a2.affiliation_id
    '''
    union_sql = '''
    SELECT COUNT(DISTINCT affiliation_id) FROM acmieee.affiliation_author
    WHERE author_id = %s OR author_id = %s
    '''

    Cursor.execute(intersection_sql, (str(start), str(end)))
    intersection_num = Cursor.fetchone()[0]

    Cursor.execute(union_sql, (str(start), str(end)))
    union_num = Cursor.fetchone()[0]

    if union_num == 0:
        return 1

    return 1 - intersection_num / union_num
def computeDirectionDistance(start, end):
    # Jaccard distance over the keyword sets of two authors; the shared
    # keywords are returned as predicted collaboration directions.
    intersection_sql = '''
    SELECT group_concat(DISTINCT a1.keyword_desc separator "\t") FROM acmieee.keyword_author a1,acmieee.keyword_author a2
    WHERE a1.author_id = %s AND a2.author_id = %s AND a1.keyword_id = a2.keyword_id
    '''
    union_sql = '''
    SELECT COUNT(DISTINCT keyword_id) FROM acmieee.keyword_author
    WHERE author_id = %s OR author_id = %s
    '''

    Cursor.execute(intersection_sql, (str(start), str(end)))
    intersection = Cursor.fetchone()[0]
    intersection_num = 0
    predictDirections = []
    if intersection:
        predictDirections = intersection.split("\t")
        intersection_num = len(predictDirections)
    Cursor.execute(union_sql, (str(start), str(end)))
    union_num = Cursor.fetchone()[0]

    if union_num == 0:
        return [1, predictDirections]

    return [1 - intersection_num / union_num, predictDirections]
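
computeJaccrdDistance, called from update_author_collaboration_job, is not defined in this listing. Given the two helpers above, one plausible (purely illustrative) composition averages the affiliation and direction distances and passes the shared keywords through:

def computeJaccrdDistance(start, end):
    # Illustrative only: element [0] is the combined distance, element
    # [1] the predicted collaboration topics.
    affiliation_distance = computeAffiliationDistance(start, end)
    direction_distance, predict_directions = computeDirectionDistance(start, end)
    return [(affiliation_distance + direction_distance) / 2,
            predict_directions]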
def update_affiliation_database_job():
    affiliation = AffiliationLoader()
    affiliation.get_affiliation_data()
    related_article_list = sorted(affiliation.related_article_dict.items(),
                                  key=lambda x: x[0],
                                  reverse=False)
    affiliation_info_list = []
    sql = '''
                        SELECT aff.name,AVG(art.citation_count),SUM(art.citation_count),
                        COUNT(art.id),MIN(YEAR(art.date)),MAX(YEAR(art.date)),
                        COUNT(art.pdf_link),AVG(art.total_usage-art.citation_count)
                        FROM article art,affiliation aff
                        WHERE art.id IN %s
                        AND aff.id = %s
                '''
    back_up_sql = '''
                        SELECT aff.name
                        FROM affiliation aff
                        WHERE aff.id = %s
                '''
    update_sql = '''
                        INSERT INTO affiliation_info
                        (affiliation_id,affiliation_name,average_citation_per_article,
                        citation_count,publication_count,start_year,end_year,
                        available_download,average_download_per_article,
                        create_time,update_time)
                        VALUES 
                        (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                        ON DUPLICATE KEY
                        UPDATE affiliation_name = VALUES (affiliation_name),
                        average_citation_per_article = VALUES (average_citation_per_article),
                        citation_count = VALUES (citation_count),
                        publication_count = VALUES(publication_count),
                        start_year = VALUES (start_year),
                        end_year = VALUES (end_year),
                        available_download = VALUES (available_download),
                        average_download_per_article = VALUES (average_download_per_article),
                        update_time = VALUES (update_time)
                '''
    for affiliations_articles in chunks(related_article_list, 500):
        for affiliation_articles in affiliations_articles:
            affiliation_id = affiliation_articles[0]
            articles = affiliation_articles[1]
            update_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # No articles: fall back to storing the name with zeroed stats.
            if not articles or len(articles) == 0:
                Cursor.execute(back_up_sql, (affiliation_id, ))
                raw_result = Cursor.fetchone()
                affiliation_name = raw_result[0]
                affiliation_info_list.append(
                    (affiliation_id, affiliation_name, 0.0, 0, 0, -1, -1, 0,
                     0.0, update_time, update_time))
                continue
            Cursor.execute(sql, (
                articles,
                affiliation_id,
            ))
            raw_result = Cursor.fetchone()
            if raw_result is None:
                continue
            affiliation_name = raw_result[0]
            average_citation_per_article = float(
                str(raw_result[1].quantize(Decimal('0.00'))))
            citation_count = int(str(raw_result[2]))
            publication_count = raw_result[3]
            start_year = raw_result[4]
            end_year = raw_result[5]
            available_download = raw_result[6]
            average_download_per_article = float(
                str(raw_result[7].quantize(Decimal('0.00'))))
            affiliation_info_list.append(
                (affiliation_id, affiliation_name,
                 average_citation_per_article, citation_count,
                 publication_count, start_year, end_year, available_download,
                 average_download_per_article, update_time, update_time))

    print("{} affiliation_info_list_len: {}".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        len(affiliation_info_list)))

    for affiliation_infos in chunks(affiliation_info_list, 500):
        try:
            Cursor.executemany(update_sql, affiliation_infos)
            Connection.commit()
        except Exception as e:
            print(e)
            Connection.rollback()
        time.sleep(1)
    print("{} update_affiliation_database_job finished".format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
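
How these jobs are scheduled is not part of the listing; a minimal manual entry point would simply run them in sequence:

if __name__ == '__main__':
    # Order is a guess: refresh base statistics before derived caches.
    update_affiliation_database_job()
    update_affiliation_keyword_job()
    update_affiliation_new_article_job()
    update_author_collaboration_job()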