def get_data(wid):
    log(wid)
    use_caching(shouldnt_compute=True)
    # should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        log(wid, "no heads")
    if doc_ids_to_entities == {}:
        log(wid, "no entities")
    from_s3 = json.loads(
        bucket.get_key('feature-data/page-%s.json' % wid).get_contents_as_string())
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {'titles': [], 'redirects': {}})
        doc_ids_combined[doc_id] = (map(preprocess,
                                        entity_response['titles']
                                        + entity_response['redirects'].keys()
                                        + entity_response['redirects'].values()
                                        + list(set(doc_ids_to_heads.get(doc_id, []))))
                                    + from_s3.get(doc_id, []))
    return doc_ids_combined.items()

def insert_entities(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)
        wpe = WikiPageToEntitiesService().get_value(args.wid)
        if not wpe:
            print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid
            return False
        print u"Priming entity data on", args.wid
        for page, entity_data in wpe.items():
            entity_list = map(my_escape,
                              list(set(entity_data.get(u'redirects', {}).values()
                                       + entity_data.get(u'titles'))))
            for i in range(0, len(entity_list), 50):
                cursor.execute(u"""
                INSERT IGNORE INTO topics (name) VALUES ("%s")
                """ % u'"), ("'.join(entity_list[i:i + 50]))
                db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False

def insert_wiki_ids(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)
        print u"Inserting wiki data for", args.wid
        response = requests.get(u'http://www.wikia.com/api/v1/Wikis/Details',
                                params={u'ids': args.wid})
        items = response.json().get(u'items')
        if not items:
            return False
        wiki_data = items[args.wid]
        cursor.execute(u"""
        INSERT INTO wikis (wiki_id, wam_score, title, url) VALUES (%s, %s, "%s", "%s")
        """ % (args.wid, str(wiki_data[u'wam_score']),
               my_escape(wiki_data[u'title']), wiki_data[u'url']))
        db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False

def insert_pages(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)
        authority_dict_fixed = get_authority_dict_fixed(args)
        if not authority_dict_fixed:
            return False
        print u"Inserting authority data for pages on wiki", args.wid
        dbargs = []
        for doc_id in authority_dict_fixed:
            wiki_id, article_id = doc_id.split(u'_')
            dbargs.append((doc_id, article_id, wiki_id, str(authority_dict_fixed[doc_id])))
        cursor.execute(u"""
        INSERT INTO articles (doc_id, article_id, wiki_id, local_authority) VALUES %s
        """ % u", ".join([u"""("%s", %s, %s, %s)""" % arg for arg in dbargs]))
        db.commit()
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False

def get_data(wiki_id):
    use_caching(per_service_cache={'TopEntitiesService.get': {'dont_compute': True},
                                   'HeadsCountService.get': {'dont_compute': True}})
    hcs = HeadsCountService().get_value(wiki_id)
    tes = TopEntitiesService().get_value(wiki_id)
    if type(hcs) == dict:
        hcs = hcs.items()
    if type(tes) == dict:
        tes = tes.items()
    return wiki_id, {'heads': sorted(hcs, key=lambda y: y[1], reverse=True)[:50],
                     'entities': sorted(tes, key=lambda y: y[1], reverse=True)}

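# Illustrative usage sketch, not part of the original snippet: consuming the
# (wiki_id, data) tuple that get_data above returns. The wiki id '831' is a
# placeholder, not taken from the source; 'heads' is capped at the top 50
# (head, count) pairs and 'entities' holds every (entity, count) pair, both
# sorted by count in descending order.
wiki_id, data = get_data('831')
top_heads = data['heads'][:10]        # highest-count heads first
top_entities = data['entities'][:10]  # highest-count entities first
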
def get_data_doc(doc_id):
    use_caching()
    # should be CombinedEntitiesService yo
    heads = HeadsService().get_value(doc_id, {})
    entities = EntitiesService().get_value(doc_id, {})
    if heads == {}:
        print doc_id, "no heads"
    if entities == {}:
        print doc_id, "no entities"
    return (entities["titles"].values()
            + entities["redirects"].keys()
            + entities["redirects"].values())

def main():
    global wiki_id
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    args, _ = get_args()
    wiki_id = args.wiki_id
    services = args.services.split(',')
    caching_dict = dict([(service + '.get', {'write_only': True}) for service in services])
    use_caching(per_service_cache=caching_dict)
    log('Calling wiki-level services on %s' % args.wiki_id)
    # pool = Pool(processes=8)
    # s = pool.map_async(get_service, services)
    # s.wait()
    for service in services:
        get_service(service)

def get_data_wid(wid):
    print wid
    use_caching(shouldnt_compute=True)
    # should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        print wid, "no heads"
    if doc_ids_to_entities == {}:
        print wid, "no entities"
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {"titles": [], "redirects": {}})
        doc_ids_combined[doc_id] = map(preprocess,
                                       entity_response["titles"]
                                       + entity_response["redirects"].keys()
                                       + entity_response["redirects"].values()
                                       + list(set(doc_ids_to_heads.get(doc_id, []))))
    return doc_ids_combined.items()

from nlp_services.caching import use_caching
from nlp_services.syntax import WikiToPageHeadsService
from nlp_services.discourse.entities import WikiPageToEntitiesService
from pprint import pprint

use_caching(shouldnt_compute=True)


def heads(wid):
    # pprint(WikiToPageHeadsService().get_value(wid, {}))
    return WikiToPageHeadsService().get_value(wid, {})


def entities(wid):
    # pprint(WikiPageToEntitiesService().get_value(wid, {}))
    return WikiPageToEntitiesService().get_value(wid, {})

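# Illustrative usage sketch, not part of the original module: dump the cached
# heads and entities for a wiki id passed on the command line, mirroring the
# commented-out pprint calls above. The __main__ guard and the argv handling
# are assumptions added for demonstration only.
if __name__ == '__main__':
    import sys
    wid = sys.argv[1]
    pprint(heads(wid))
    pprint(entities(wid))
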
def insert_contrib_data(args):
    try:
        use_caching(is_read_only=True, shouldnt_compute=True)
        db, cursor = get_db_and_cursor(args)
        wpe = WikiPageToEntitiesService().get_value(args.wid)
        if not wpe:
            print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", args.wid
            return False
        authority_dict_fixed = get_authority_dict_fixed(args)
        if not authority_dict_fixed:
            return False
        print u"Inserting page and author and contrib data for wiki", args.wid
        for doc_id in authority_dict_fixed:
            wiki_id, article_id = doc_id.split(u'_')
            entity_data = wpe.get(doc_id, {})
            entity_list = filter(lambda x: x,
                                 map(lambda x: x.strip(),
                                     map(my_escape,
                                         list(set(entity_data.get(u'redirects', {}).values()
                                                  + entity_data.get(u'titles', []))))))
            cursor.execute(u"""
            SELECT topic_id FROM topics WHERE name IN ("%s")
            """ % (u'", "'.join(entity_list)))
            topic_ids = list(set([result[0] for result in cursor.fetchall()]))
            for topic_id in topic_ids:
                sql = u"""
                INSERT IGNORE INTO articles_topics (article_id, wiki_id, topic_id)
                VALUES (%s, %s, %s)
                """ % (article_id, wiki_id, topic_id)
                cursor.execute(sql)
            db.commit()
            cursor = db.cursor()
            for contribs in PageAuthorityService().get_value(doc_id, []):
                cursor.execute(u"""
                INSERT IGNORE INTO users (user_id, user_name) VALUES (%d, "%s")
                """ % (contribs[u'userid'], my_escape(contribs[u'user'])))
                db.commit()
                cursor.execute(u"""
                INSERT INTO articles_users (article_id, wiki_id, user_id, contribs)
                VALUES (%s, %s, %d, %s)
                """ % (article_id, wiki_id, contribs[u'userid'], contribs[u'contribs']))
                db.commit()
                local_authority = contribs[u'contribs'] * authority_dict_fixed.get(doc_id, 0)
                for topic_id in topic_ids:
                    cursor.execute(u"""
                    INSERT INTO topics_users (user_id, topic_id, local_authority) VALUES (%d, %s, %s)
                    ON DUPLICATE KEY UPDATE local_authority = local_authority + %s
                    """ % (contribs[u'userid'], topic_id, local_authority, local_authority))
                    db.commit()
            db.commit()
        print u"Done with", args.wid
        return args
    except Exception as e:
        print e, traceback.format_exc()
        return False

def main():
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
    use_caching()
    args = get_args()
    get_model_from_args(args)
    log("Done")

def main():
    args, _ = get_args()
    use_caching(per_service_cache=dict([(service + '.get', {'write_only': True})
                                        for service in args.services.split(',')]))
    call_services(args)

parser = argparse.ArgumentParser(
    description="Generate a per-page topic model using latent dirichlet analysis.")
parser.add_argument("--wiki_ids", dest="wiki_ids_file", nargs="?", type=argparse.FileType("r"),
                    help="The source file of wiki IDs sorted by WAM")
parser.add_argument("--num_wikis", dest="num_wikis", type=int, action="store",
                    help="The number of wikis to process")
parser.add_argument("--num_topics", dest="num_topics", type=int, action="store",
                    help="The number of topics for the model to use")
parser.add_argument("--model_dest", dest="model_dest", type=str, action="store",
                    help="Where to save the model")
args = parser.parse_args()

use_caching()


def get_data_wid(wid):
    print wid
    use_caching(shouldnt_compute=True)
    # should be CombinedEntitiesService yo
    doc_ids_to_heads = WikiToPageHeadsService().get_value(wid, {})
    doc_ids_to_entities = WikiPageToEntitiesService().get_value(wid, {})
    doc_ids_combined = {}
    if doc_ids_to_heads == {}:
        print wid, "no heads"
    if doc_ids_to_entities == {}:
        print wid, "no entities"
    for doc_id in doc_ids_to_heads:
        entity_response = doc_ids_to_entities.get(doc_id, {"titles": [], "redirects": {}})

def main():
    use_caching()
    args = get_args()
    set_global_num_processes(args.num_processes)
    api_data = get_api_data(args.wiki_id)

    workbook = xlwt.Workbook()
    pages_sheet = workbook.add_sheet("Pages by Authority")
    pages_sheet.write(0, 0, "Page")
    pages_sheet.write(0, 1, "Authority")

    print "Getting Page Data..."
    page_authority = get_page_authority(api_data)

    print "Writing Page Data..."
    pages, authorities = zip(*page_authority)
    scaler = MinMaxScaler(authorities, enforced_min=0, enforced_max=100)
    for i, page in enumerate(pages):
        if i > 65000:
            break
        pages_sheet.write(i + 1, 0, page)
        pages_sheet.write(i + 1, 1, scaler.scale(authorities[i]))

    print "Getting Author and Topic Data..."
    author_authority = get_author_authority(api_data)
    topic_authority = sorted(WikiTopicsToAuthorityService().get_value(args.wiki_id),
                             key=lambda y: y[1]['authority'], reverse=True)

    print "Writing Author Data..."
    authors_sheet = workbook.add_sheet("Authors by Authority")
    authors_sheet.write(0, 0, "Author")
    authors_sheet.write(0, 1, "Authority")

    authors_topics_sheet = workbook.add_sheet("Topics for Best Authors")
    authors_topics_sheet.write(0, 0, "Author")
    authors_topics_sheet.write(0, 1, "Topic")
    authors_topics_sheet.write(0, 2, "Rank")
    authors_topics_sheet.write(0, 3, "Score")

    # why is total_authority not there?
    all_total_authorities = [author.get('total_authority', 0) for author in author_authority]
    scaler = MinMaxScaler(all_total_authorities, enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, author in enumerate(author_authority):
        authors_sheet.write(i + 1, 0, author['name'])
        authors_sheet.write(i + 1, 1, scaler.scale(author['total_authority']))
        for rank, topic in enumerate(author['topics'][:10]):
            if pivot_counter > 65000:
                break
            authors_topics_sheet.write(pivot_counter, 0, author['name'])
            authors_topics_sheet.write(pivot_counter, 1, topic[0])
            authors_topics_sheet.write(pivot_counter, 2, rank + 1)
            authors_topics_sheet.write(pivot_counter, 3, topic[1])
            pivot_counter += 1
        if i > 65000:
            break

    print "Writing Topic Data"
    topics_sheet = workbook.add_sheet("Topics by Authority")
    topics_sheet.write(0, 0, "Topic")
    topics_sheet.write(0, 1, "Authority")

    topics_authors_sheet = workbook.add_sheet("Authors for Best Topics")
    topics_authors_sheet.write(0, 0, "Topic")
    topics_authors_sheet.write(0, 1, "Author")
    topics_authors_sheet.write(0, 2, "Rank")
    topics_authors_sheet.write(0, 3, "Authority")

    scaler = MinMaxScaler([x[1].get('authority', 0) for x in topic_authority],
                          enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, topic in enumerate(topic_authority):
        topics_sheet.write(i + 1, 0, topic[0])
        topics_sheet.write(i + 1, 1, scaler.scale(topic[1]['authority']))
        authors = topic[1]['authors']
        for rank, author in enumerate(authors[:10]):
            if pivot_counter > 65000:
                break
            topics_authors_sheet.write(pivot_counter, 0, topic[0])
            topics_authors_sheet.write(pivot_counter, 1, author['author'])
            topics_authors_sheet.write(pivot_counter, 2, rank + 1)
            topics_authors_sheet.write(pivot_counter, 3, author['topic_authority'])
            pivot_counter += 1
        if i > 65000:
            break

    print "Saving to Excel"
    wiki_name = api_data['url'].replace('http://', '').replace('.wikia', '').replace('.com/', '')
    fname = "%s-%s-authority-data-%s.xls" % (
        args.wiki_id, wiki_name, datetime.strftime(datetime.now(), '%Y-%m-%d-%H-%M'))
    workbook.save(fname)
    if args.send_to_s3:
        bucket = connect_s3().get_bucket('nlp-data')
        k = bucket.new_key('authority/%s/%s' % (args.wiki_id, fname))
        k.set_contents_from_filename(fname)
    print fname

def get_workbook(self, num_processes=8):
    use_caching()
    set_global_num_processes(num_processes)

    workbook = xlwt.Workbook()
    pages_sheet = workbook.add_sheet(u"Pages by Authority")
    pages_sheet.write(0, 0, u"Page")
    pages_sheet.write(0, 1, u"Authority")

    page_authority = self.get_all_pages()
    pages, authorities = zip(*page_authority)
    scaler = MinMaxScaler(authorities, enforced_min=0, enforced_max=100)
    for i, page in enumerate(pages):
        if i > 65000:
            break
        pages_sheet.write(i + 1, 0, page)
        pages_sheet.write(i + 1, 1, scaler.scale(authorities[i]))

    author_authority = self.get_all_authors().values()
    for counter, author in enumerate(author_authority):
        author[u'topics'] = [topic.topic for topic in
                             UserModel(author, self.args).get_topics_for_wiki(self.wiki_id, limit=5)]
        if counter > 25:
            break

    topic_authority = self.get_topics(limit=None)
    for counter, topic in enumerate(topic_authority):
        topic[u'authors'] = TopicModel(topic[u'topic'], self.args).get_users(5, for_api=True)
        if counter > 25:
            break

    authors_sheet = workbook.add_sheet(u"Authors by Authority")
    authors_sheet.write(0, 0, u"Author")
    authors_sheet.write(0, 1, u"Authority")

    authors_topics_sheet = workbook.add_sheet(u"Topics for Best Authors")
    authors_topics_sheet.write(0, 0, u"Author")
    authors_topics_sheet.write(0, 1, u"Topic")
    authors_topics_sheet.write(0, 2, u"Rank")
    authors_topics_sheet.write(0, 3, u"Score")

    # why is total_authority not there?
    all_total_authorities = [author.get(u'total_authority', 0) for author in author_authority]
    scaler = MinMaxScaler(all_total_authorities, enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, author in enumerate(author_authority):
        print author
        authors_sheet.write(i + 1, 0, author[u'name'])
        authors_sheet.write(i + 1, 1, scaler.scale(author[u'total_authority']))
        for rank, topic in enumerate(author.get(u'topics', [])[:10]):
            if pivot_counter > 65000:
                break
            authors_topics_sheet.write(pivot_counter, 0, author[u'name'])
            authors_topics_sheet.write(pivot_counter, 1, topic[0])
            authors_topics_sheet.write(pivot_counter, 2, rank + 1)
            authors_topics_sheet.write(pivot_counter, 3, topic[1])
            pivot_counter += 1
        if i > 65000:
            break

    topics_sheet = workbook.add_sheet(u"Topics by Authority")
    topics_sheet.write(0, 0, u"Topic")
    topics_sheet.write(0, 1, u"Authority")

    topics_authors_sheet = workbook.add_sheet(u"Authors for Best Topics")
    topics_authors_sheet.write(0, 0, u"Topic")
    topics_authors_sheet.write(0, 1, u"Author")
    topics_authors_sheet.write(0, 2, u"Rank")
    topics_authors_sheet.write(0, 3, u"Authority")

    scaler = MinMaxScaler([x[1].get(u'authority', 0) for x in topic_authority],
                          enforced_min=0, enforced_max=100)
    pivot_counter = 1
    for i, topic in enumerate(topic_authority):
        topics_sheet.write(i + 1, 0, topic[0])
        topics_sheet.write(i + 1, 1, scaler.scale(topic[1][u'authority']))
        authors = topic[1][u'authors']
        for rank, author in enumerate(authors[:10]):
            if pivot_counter > 65000:
                break
            topics_authors_sheet.write(pivot_counter, 0, topic[0])
            topics_authors_sheet.write(pivot_counter, 1, author[u'author'])
            topics_authors_sheet.write(pivot_counter, 2, rank + 1)
            topics_authors_sheet.write(pivot_counter, 3, author[u'topic_authority'])
            pivot_counter += 1
        if i > 65000:
            break

    return workbook

from boto.s3.key import Key
from boto.exception import S3ResponseError
import traceback
import boto
import sys
import re
import json
import time
import random

from nlp_services.caching import use_caching

BUCKET = boto.connect_s3().get_bucket('nlp-data')
service_file = sys.argv[2] if len(sys.argv) > 2 else 'services-config.json'
SERVICES = json.loads(open(service_file).read())['services']
use_caching(per_service_cache=dict([(service + '.get', {'write_only': True})
                                    for service in SERVICES]))


def process_file(filename):
    if filename.strip() == '':
        return  # newline at end of file
    global SERVICES
    match = re.search('([0-9]+)/([0-9]+)', filename)
    if match is None:
        print "No match for %s" % filename
        return
    doc_id = '%s_%s' % (match.group(1), match.group(2))
    for service in SERVICES:
        try:
            getattr(sys.modules[__name__], service)().get(doc_id)
        except KeyboardInterrupt:

import json
import os
import sys
import traceback

from boto import connect_s3
from nlp_services.caching import use_caching
from nlp_services.syntax import WikiToPageHeadsService, HeadsCountService, TopHeadsService
from nlp_services.discourse.entities import (
    WikiEntitiesService, WpWikiEntitiesService, CombinedWikiEntitiesService,
    TopEntitiesService, WpTopEntitiesService, CombinedTopEntitiesService,
    WikiPageEntitiesService, WpWikiPageEntitiesService, CombinedWikiPageEntitiesService,
    EntityDocumentCountsService, WpEntityDocumentCountsService, CombinedDocumentEntityCountsService,
    WikiPageToEntitiesService, WpPageToEntitiesService, CombinedPageToEntitiesService)
from nlp_services.discourse import AllEntitiesSentimentAndCountsService
from nlp_services.discourse.sentiment import WikiEntitySentimentService, WpWikiEntitySentimentService
from title_confirmation.wikia import AllTitlesService, RedirectsService

sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)

# todo argparse
service_file = 'services-config.json'
SERVICES = json.loads(open(service_file).read())['wiki-services']
caching_dict = dict([(service + '.get', {'write_only': True}) for service in SERVICES])
use_caching(per_service_cache=caching_dict)

wid = sys.argv[1]

try:
    for service in SERVICES:
        try:
            print service
            getattr(sys.modules[__name__], service)().get(wid)
            caching_dict[service + '.get'] = {'dont_compute': True}  # DRY fool!
            use_caching(per_service_cache=caching_dict)
        except KeyboardInterrupt:
            sys.exit()
        except Exception as e:
            print 'Could not call %s on %s!' % (service, wid)
            print traceback.format_exc()
except:

def ingest_data(wiki_id):
    """
    Create Solr documents for a given wiki ID

    :param wiki_id: the ID of the wiki (int or str)
    :type wiki_id: int

    :return:
    """
    # make sure all pages and all user pages exist
    solr.existing_collection(solr.all_pages_collection())
    solr.existing_collection(solr.all_user_pages_collection())

    resp = requests.get(u'http://www.wikia.com/api/v1/Wikis/Details', params={u'ids': wiki_id})
    items = resp.json()['items']
    if wiki_id not in items:
        print u"Wiki doesn't exist?"
        return

    api_data = items[wiki_id]
    wiki_data = {
        'id': api_data['id'],
        'wam_f': {'set': api_data['wam_score']},
        'title_s': {'set': api_data['title']},
        'attr_title': {'set': api_data['title']},
        'attr_desc': {'set': api_data['desc']}
    }
    for key in api_data['stats'].keys():
        wiki_data['%s_i' % key] = {'set': api_data['stats'][key]}

    wiki_api_data = requests.get(u'%swikia.php' % (api_data[u'url']),
                                 params={u'method': u'getForWiki',
                                         u'service': u'CrossWikiCore',
                                         u'controller': u'WikiaSearchIndexerController'}).json()[u'contents']
    wiki_data[u'hub_s'] = wiki_api_data[u'hub_s']

    # easier
    api_data[u'hub_s'] = wiki_api_data[u'hub_s']

    collection = solr.existing_collection(solr.collection_for_wiki(wiki_id))

    use_caching(is_read_only=True, shouldnt_compute=True)

    wpe = WikiPageToEntitiesService().get_value(wiki_id)
    if not wpe:
        print u"NO WIKI PAGE TO ENTITIES SERVICE FOR", wiki_id
        return False

    documents = []
    grouped_futures = []
    pages_to_authority = WikiAuthorityService().get_value(str(wiki_data['id']))
    for counter, (doc_id, entity_data) in enumerate(wpe.items()):
        documents.append({
            'id': doc_id,
            'attr_entities': {'set': list(set(entity_data.get(u'redirects', {}).values()
                                              + entity_data.get(u'titles')))},
            'type_s': {'set': 'Page'},
            'authority_f': {'set': pages_to_authority.get(doc_id, 0)},
            'hub_s': wiki_api_data['hub_s']
        })

        if counter != 0 and counter % 1500 == 0:
            grouped_futures.append(
                group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
            )
            documents = []

    grouped_futures.append(
        group(add_with_metadata.s(api_data, grouping) for grouping in iter_grouper(15, documents))()
    )

    # block on completion of all grouped futures
    completed = 0
    total = 0
    while len(filter(lambda x: not x.ready(), grouped_futures)) > 1:
        new_completed = 0
        new_total = 0
        for future in grouped_futures:
            new_completed += future.completed_count()
            new_total += len(future.results)
        if completed != new_completed or total != new_total:
            completed = new_completed
            total = new_total
            print "Grouped Tasks: (%d/%d)" % (completed, total)
        sleep(2)

    all_user_tuples = []
    for future in grouped_futures:
        result = get_with_backoff(future, [])
        map(all_user_tuples.extend, result)

    all_user_tuples = list(set(all_user_tuples))
    if not all_user_tuples:
        print "Empty user tuples, bailing"
        return

    # assign the unique user ids to the first variable, and the unique usernames to the second
    all_user_ids, all_users = zip(*all_user_tuples)

    collection.commit()
    solr.all_pages_collection().commit()
    solr.all_user_pages_collection().commit()

    wiki_data['attr_entities'] = {'set': []}
    for count, entities in WikiEntitiesService().get_value(str(wiki_id)).items():
        for entity in entities:
            map(wiki_data['attr_entities']['set'].append, [entity] * int(count))  # goddamnit count isn't int

    wiki_data['user_ids_is'] = {'set': all_user_ids}
    wiki_data['attr_users'] = {'set': all_users}
    wiki_data['total_authority_f'] = {'set': sum(pages_to_authority.values())}
    wiki_data['authorities_fs'] = {'set': pages_to_authority.values()}

    wiki_collection = solr.existing_collection(solr.global_collection())
    wiki_collection.add([wiki_data])
    wiki_collection.commit()
    print "Committed wiki data"

    print "Retrieving user docs..."
    futures = group(build_wiki_user_doc.s(api_data, user_tuple) for user_tuple in all_user_tuples)()
    future_result_len = len(futures.results)
    while not futures.ready():
        print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)

    user_docs = get_with_backoff(futures, [])
    if not user_docs:
        print "User docs was empty. Possibly connection problems."
        return

    authority_scaler = MinMaxScaler([doc['total_page_authority_f']['set'] for doc in user_docs])
    contribs_scaler = MinMaxScaler([doc['total_contribs_f']['set'] for doc in user_docs])
    for doc in user_docs:
        scaled_authority = authority_scaler.scale(doc['total_page_authority_f']['set'])
        scaled_contribs = contribs_scaler.scale(doc['total_contribs_f']['set'])
        doc['scaled_authority_f'] = {'set': scaled_authority}
        doc['scaled_contribs_f'] = {'set': scaled_contribs}
        doc['scaled_contribs_authority_f'] = {'set': scaled_authority * scaled_contribs}

    wiki_user_collection = solr.existing_collection(solr.wiki_user_collection())
    wiki_user_collection.add(user_docs)
    wiki_user_collection.commit()

    print "Analyzing topics"
    futures = group(get_wiki_topic_doc.s(wiki_data['id'], topic)
                    for topic in list(set(wiki_data['attr_entities']['set'])))()
    future_result_len = len(futures.results)
    counter = 0
    while not futures.ready():
        if counter % 5 == 0:
            print "Progress: (%d/%d)" % (futures.completed_count(), future_result_len)
        sleep(2)
        counter += 1

    topic_docs = get_with_backoff(futures, [])
    if not topic_docs:
        print "No topics, probably a connection error"
        return

    collection.add(topic_docs)
    collection.commit()

    topic_collection = solr.existing_collection(solr.all_topics_collection())
    topic_collection.add(topic_docs)
    topic_collection.commit()
