def author_centrality(titles_to_authors):
    # Build a bipartite graph of title nodes and author nodes, then score
    # each author by PageRank centrality within that graph.
    author_graph = digraph()
    author_graph.add_nodes([u"title_%s" % title for title in titles_to_authors])
    author_graph.add_nodes(list(set(
        u'author_%s' % author[u'user']
        for authors in titles_to_authors.values()
        for author in authors)))

    for title, authors in titles_to_authors.items():
        log.debug(u"Working on title: %s" % title)
        for author in authors:
            try:
                author_graph.add_edge(
                    (u'title_%s' % title, u'author_%s' % author[u'user']))
            except AdditionError:
                # The same author can appear more than once for a title;
                # ignore duplicate edges.
                pass

    # Keep only the author nodes from the PageRank result, stripping the
    # 'author_' prefix (the join preserves underscores in user names).
    centralities = dict(
        ('_'.join(node.split('_')[1:]), score)
        for node, score in pagerank(author_graph).items()
        if node.startswith(u'author_'))

    centrality_scaler = MinMaxScaler(centralities.values())

    return dict((author, centrality_scaler.scale(score))
                for author, score in centralities.items())

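These snippets lean on a MinMaxScaler helper that is not shown here. The class below is only a sketch of the interface implied by its call sites (values passed to the constructor, optional enforced_min/enforced_max bounds, and a scale() method); the project's actual implementation may differ.

class MinMaxScaler(object):
    """Min-max normalize values into [enforced_min, enforced_max]."""

    def __init__(self, vals, enforced_min=0, enforced_max=1):
        self.min_val = min(vals)
        self.max_val = max(vals)
        self.enforced_min = enforced_min
        self.enforced_max = enforced_max

    def scale(self, val):
        spread = float(self.max_val - self.min_val)
        if spread == 0:
            # All inputs are identical; pin to the lower bound.
            return self.enforced_min
        return ((val - self.min_val) / spread *
                (self.enforced_max - self.enforced_min) + self.enforced_min)
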
def get_title_top_authors(args, all_titles, all_revisions):
    # Fan out per-title author extraction across a worker pool; the callback
    # receives the full result list, which dict.update() consumes as
    # (title, authors) pairs.
    pool = multiprocessing.Pool(processes=args.processes)
    title_top_authors = {}
    r = pool.map_async(
        get_contributing_authors_safe,
        [(title_obj, all_revisions.get(title_obj[u'title'], []))
         for title_obj in all_titles],
        callback=title_top_authors.update)
    r.wait()
    if not title_top_authors:
        log.info(u"No title top authors for wiki %s" % args.wiki_id)
        log.info(r.get())
        sys.exit(1)
    contribs = [author[u'contribs'] for authors in title_top_authors.values()
                for author in authors]
    if not contribs:
        log.info(u"No contributions for wiki %s" % args.wiki_id)
        sys.exit(1)
    contribs_scaler = MinMaxScaler(contribs)
    scaled_title_top_authors = {}
    for title, authors in title_top_authors.items():
        new_authors = []
        for author in authors:
            author[u'contribs'] = contribs_scaler.scale(author[u'contribs'])
            new_authors.append(author)
        scaled_title_top_authors[title] = new_authors
    return scaled_title_top_authors
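get_contributing_authors_safe is defined elsewhere in the project. Judging from its call sites, it takes a (title_object, revisions) tuple and returns a (title, authors) pair whose author dicts carry at least u'user' and u'contribs'. The stub below only illustrates that assumed contract; it is not the real implementation.

def get_contributing_authors_safe_stub(arg):
    # Hypothetical stand-in: unpack the (title_object, revisions) tuple and
    # return a (title, authors) pair that title_top_authors.update() can consume.
    title_obj, revisions = arg
    return title_obj[u'title'], [{u'user': u'ExampleUser',
                                  u'contribs': len(revisions)}]
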
Example #3
def main():
    use_caching()
    args = get_args()
    set_global_num_processes(args.num_processes)
    api_data = get_api_data(args.wiki_id)

    workbook = xlwt.Workbook()
    pages_sheet = workbook.add_sheet("Pages by Authority")
    pages_sheet.write(0, 0, "Page")
    pages_sheet.write(0, 1, "Authority")

    print "Getting Page Data..."
    page_authority = get_page_authority(api_data)

    print "Writing Page Data..."
    pages, authorities = zip(*page_authority)
    scaler = MinMaxScaler(authorities, enforced_min=0, enforced_max=100)
    for i, page in enumerate(pages):
        # Legacy .xls sheets max out at 65,536 rows; stop before the limit.
        if i > 65000:
            break
        pages_sheet.write(i+1, 0, page)
        pages_sheet.write(i+1, 1, scaler.scale(authorities[i]))

    print "Getting Author and Topic Data..."
    author_authority = get_author_authority(api_data)
    topic_authority = sorted(WikiTopicsToAuthorityService().get_value(args.wiki_id),
                             key=lambda y: y[1]['authority'], reverse=True)

    print "Writing Author Data..."
    authors_sheet = workbook.add_sheet("Authors by Authority")
    authors_sheet.write(0, 0, "Author")
    authors_sheet.write(0, 1, "Authority")

    authors_topics_sheet = workbook.add_sheet("Topics for Best Authors")
    authors_topics_sheet.write(0, 0, "Author")
    authors_topics_sheet.write(0, 1, "Topic")
    authors_topics_sheet.write(0, 2, "Rank")
    authors_topics_sheet.write(0, 3, "Score")

    # why is total_authority not there?
    all_total_authorities = [author.get('total_authority', 0) for author in author_authority]
    scaler = MinMaxScaler(all_total_authorities, enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, author in enumerate(author_authority):
        authors_sheet.write(i+1, 0, author['name'])
        # Use .get() here as well, since total_authority is sometimes missing
        # (see the comment above).
        authors_sheet.write(i+1, 1, scaler.scale(author.get('total_authority', 0)))
        for rank, topic in enumerate(author['topics'][:10]):
            if pivot_counter > 65000:
                break
            authors_topics_sheet.write(pivot_counter, 0, author['name'])
            authors_topics_sheet.write(pivot_counter, 1, topic[0])
            authors_topics_sheet.write(pivot_counter, 2, rank+1)
            authors_topics_sheet.write(pivot_counter, 3, topic[1])
            pivot_counter += 1
        if i > 65000:
            break

    print "Writing Topic Data"
    topics_sheet = workbook.add_sheet("Topics by Authority")
    topics_sheet.write(0, 0, "Topic")
    topics_sheet.write(0, 1, "Authority")

    topics_authors_sheet = workbook.add_sheet("Authors for Best Topics")
    topics_authors_sheet.write(0, 0, "Topic")
    topics_authors_sheet.write(0, 1, "Author")
    topics_authors_sheet.write(0, 2, "Rank")
    topics_authors_sheet.write(0, 3, "Authority")

    scaler = MinMaxScaler([x[1].get('authority', 0) for x in topic_authority],
                          enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, topic in enumerate(topic_authority):
        topics_sheet.write(i+1, 0, topic[0])
        topics_sheet.write(i+1, 1, scaler.scale(topic[1]['authority']))
        authors = topic[1]['authors']
        for rank, author in enumerate(authors[:10]):
            if pivot_counter > 65000:
                break
            topics_authors_sheet.write(pivot_counter, 0, topic[0])
            topics_authors_sheet.write(pivot_counter, 1, author['author'])
            topics_authors_sheet.write(pivot_counter, 2, rank+1)
            topics_authors_sheet.write(pivot_counter, 3, author['topic_authority'])
            pivot_counter += 1

        if i > 65000:
            break

    print "Saving to Excel"
    wiki_name = api_data['url'].replace('http://', '').replace('.wikia', '').replace('.com/', '')
    fname = "%s-%s-authority-data-%s.xls" % (args.wiki_id, wiki_name,
                                             datetime.strftime(datetime.now(), '%Y-%m-%d-%H-%M'))
    workbook.save(fname)

    if args.send_to_s3:
        bucket = connect_s3().get_bucket('nlp-data')
        k = bucket.new_key('authority/%s/%s' % (args.wiki_id, fname))
        k.set_contents_from_filename(fname)

    print fname
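get_args, use_caching, get_api_data, get_page_authority and get_author_authority come from the surrounding project and are not shown. As a rough illustration only, the parser below mirrors the attributes main() reads (wiki_id, num_processes, send_to_s3); the actual flag names and defaults in the project may differ.

import argparse

def get_args_sketch():
    # Hypothetical argument parser matching the attributes accessed in main().
    parser = argparse.ArgumentParser()
    parser.add_argument('--wiki-id', dest='wiki_id', required=True)
    parser.add_argument('--num-processes', dest='num_processes', type=int, default=1)
    parser.add_argument('--send-to-s3', dest='send_to_s3', action='store_true')
    return parser.parse_args()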