def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    local_output_root = kwargs.get('local-output-root', None)
    page_topics_pickle = kwargs['page-topic-pickle']

    page_topics = pickle.load(open(page_topics_pickle, 'rb'))
    _pt_dict = {}
    for p in page_topics:
        _pt_dict.update({p[0]: p[1]})
    pt_dict = sc.broadcast(_pt_dict)

    pages_other = sc.accumulator([], ListParam())

    author_entity = sc.pickleFile(input_path)
    author_topic = author_entity.map(
        lambda x: entities_to_topics(x, pt_dict, pages_other))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_topic.saveAsPickleFile(output_path)

    # accumulator values are only populated once an action has run,
    # so inspect them after saveAsPickleFile
    print('pages_other:')
    pp.pprint(pages_other.value)
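# `ListParam` is not defined in this section; a minimal sketch of what a
# list-concatenating accumulator parameter for the `pages_other` accumulator
# above might look like, using PySpark's AccumulatorParam interface. The real
# implementation may differ.
from pyspark.accumulators import AccumulatorParam


class ListParam(AccumulatorParam):
    def zero(self, value):
        # start each task from an empty list
        return []

    def addInPlace(self, acc1, acc2):
        # merge partial lists coming back from different tasks
        return acc1 + acc2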
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(
        hdfs_root, kwargs.get('input-job', None),
        kwargs.get('input-path', None))  # /user/username/data/output/_jobs/author_subreddit/latest
    subreddit_category_pickle = kwargs['subreddit-category-pickle']
    subreddit_df_pickle = kwargs['subreddit-df-pickle']

    subreddit_category = pickle.load(open(subreddit_category_pickle, 'rb'))
    _subreddit_df = pickle.load(open(subreddit_df_pickle, 'rb'))
    subreddit_df = sc.broadcast(_subreddit_df)

    # create a vectorizer for each category out of the subreddit_df dataset
    subreddits_grouped_by_categories = subreddits_to_categories(
        _subreddit_df, subreddit_category)
    _vectorizers = {}
    for k, v in subreddits_grouped_by_categories.items():
        dv = DictVectorizer()
        dv.fit_transform(v)
        _vectorizers[k] = dv
    vectorizers = sc.broadcast(_vectorizers)

    data = sc.pickleFile(
        input_path)  # /user/username/data/output/author_category_vec/latest
    authors_total = data.count()
    author_category = data.map(lambda x: get_feature_vectors(
        x, subreddit_df, authors_total, vectorizers))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    botlist_csv = kwargs['botlist-csv']

    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)
    import nltk

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    print("input_path: " + input_path)
    file = sc.textFile(
        input_path)  # /user/username/data/output/sub_com_threads
    threads = file.map(lambda l: json.loads(l))

    pairs = threads.map(lambda x: top_level(x, botlist)).filter(lambda x: x)
    pickle.dump(
        pairs.collect(),
        open(
            '/home/username/data/output/_jobs/pairs_winargs_malleability.pickle',
            'wb'))

    pairs_json = pairs.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    pairs_json.saveAsTextFile(output_path)
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    botlist_csv = kwargs['botlist-csv']

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    print("input_path: " + input_path)
    file = sc.textFile(input_path)
    threads = file.map(lambda l: json.loads(l))

    pairs = threads.flatMap(lambda x: top_level(x))
    pairs = pairs.filter(lambda x: x[0]['author'] not in botlist.value and
                         x[1]['author'] not in botlist.value)
    ### TODO:
    ### - filter out pairs where sub['selftext'] or com['body'] is empty
    ### - OR author == '[removed]' / '[deleted]'

    pairs_json = pairs.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    pairs_json.saveAsTextFile(output_path)
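# `top_level` is defined elsewhere in the repo. A hedged sketch of what the
# variant used in the flatMap above might do, assuming a thread record of the
# form {'submission': {...}, 'comments': [...]} where a top-level comment's
# `parent_id` points at the submission's fullname; the field names are
# assumptions, not the repo's confirmed schema.
def top_level(thread):
    sub = thread['submission']
    # keep only (submission, top-level comment) pairs
    return [(sub, com) for com in thread.get('comments', [])
            if com.get('parent_id') == sub.get('name')]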
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    file = sc.textFile(
        input_path)  # /user/username/data/output/author_subreddits/latest
    data = file.map(lambda l: json.loads(l))

    flattened = data.flatMap(lambda x: [y for y in x['subreddits']])
    reduced = flattened.map(lambda x: (x['subreddit'], 1)).reduceByKey(
        lambda a, b: a + b)
    mapped = reduced.map(lambda x: {"subreddit": x[0], "df": x[1]})

    subreddit_df_json = mapped.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    subreddit_df_json.saveAsTextFile(output_path)

    print('Saving to {0}...'.format(local_output_root + '/subreddit_df.pickle'))
    collected = mapped.collect()
    _df_dict = {}
    for s in collected:
        _df_dict.update({s['subreddit']: s['df']})
    pickle.dump(_df_dict,
                open(local_output_root + '/subreddit_df.pickle', 'wb'))
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    data = sc.pickleFile(
        input_path)  # /user/username/data/output/author_category/latest

    flattened = data.flatMap(lambda x: [k for k, v in x['categories'].items()])
    print(flattened, flattened.count())
    reduced = flattened.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
    print(reduced, reduced.count())
    mapped = reduced.map(lambda x: {"category": x[0], "df": x[1]})
    print(mapped, mapped.count())

    data_json = mapped.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    print(data_json, data_json.count())
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    data_json.saveAsTextFile(output_path)

    print('Saving to {0}...'.format(local_output_root + '/category_df.pickle'))
    collected = mapped.collect()
    _df_dict = {}
    for s in collected:
        _df_dict.update({s['category']: s['df']})
    pickle.dump(_df_dict,
                open(local_output_root + '/category_df.pickle', 'wb'))
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    subs_path = kwargs['subs-path']
    coms_path = kwargs['coms-path']
    botlist_csv = kwargs['botlist-csv']

    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    subs_file = sc.textFile(subs_path)
    subs_data = subs_file.map(lambda l: jsonloads(l)).filter(
        lambda l: l != "" and 'author' in l and 'selftext' in l)
    coms_file = sc.textFile(coms_path)
    coms_data = coms_file.map(lambda l: jsonloads(l)).filter(
        lambda l: l != "" and 'author' in l and 'body' in l)

    print("\n#Filtering sub/com...")
    subs_data = subs_data.filter(
        lambda x: x['author'] not in botlist.value and
        len(x["selftext"]) > 0 and x["selftext"] != "[removed]")
    coms_data = coms_data.filter(
        lambda x: x['author'] not in botlist.value and
        len(x["body"]) > 0 and x["body"] != "[removed]")

    # keep only authors from the CMV author list
    cmv_authors = pickle.load(
        open('/home/username/data/output/cmv_authors.pickle', 'rb'))
    cmv_authors_b = sc.broadcast(cmv_authors)
    subs_data = subs_data.filter(lambda x: x['author'] in cmv_authors_b.value)
    coms_data = coms_data.filter(lambda x: x['author'] in cmv_authors_b.value)

    print("\n#Running nltk...")
    subs_sentences = subs_data.map(lambda x: get_sentences(x, 'selftext')).map(
        lambda x: (x['author'], x['sentences']))
    coms_sentences = coms_data.map(lambda x: get_sentences(x, 'body')).map(
        lambda x: (x['author'], x['sentences']))
    print(subs_sentences.take(1))

    print("\n#Union, flatten, reduce...")
    united = sc.union([subs_sentences, coms_sentences])
    print(united.take(1))
    sentences_reduced = united.reduceByKey(lambda a, b: a + b)
    print(sentences_reduced.take(1))

    print("\n#Saving results...")
    output_json = sentences_reduced.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    output_json.saveAsTextFile(output_path)
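# `jsonloads` is not defined in this section. Judging by the `l != ""` filter
# above, it is presumably a parse-safe wrapper around json.loads that returns
# an empty string on malformed records; a minimal sketch under that assumption:
import json


def jsonloads(line):
    try:
        return json.loads(line)
    except ValueError:
        # malformed record; the caller filters these out
        return ""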
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    input_path = utils.get_input_path(
        hdfs_root, kwargs.get('input-job', None),
        kwargs.get('input-path', None))  # /user/username/data/output/_jobs/author_subreddit/latest
    local_output_root = kwargs.get('local-output-root', None)
    subreddit_category_pickle = kwargs['subreddit-category-pickle']

    subreddit_category = pickle.load(open(subreddit_category_pickle, 'rb'))

    author_subreddit = sc.pickleFile(
        input_path)  # /user/username/data/output/author_subreddit_vec/latest
    author_category = author_subreddit.map(
        lambda x: subreddits_to_categories(x, subreddit_category))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    category_df_pickle = kwargs['category-df-pickle']

    _category_df = pickle.load(open(category_df_pickle, 'rb'))
    category_df = sc.broadcast(_category_df)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(_category_df)
    dv = sc.broadcast(_dv)

    data = sc.pickleFile(
        input_path)  # /user/username/data/output/author_category/latest
    authors_total = data.count()
    author_category_features = data.map(
        lambda x: get_feature_vectors(x, authors_total, category_df, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_category_features.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs.get('local-output-root', None)
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    subreddit_df_pickle = kwargs['subreddit-df-pickle']

    _subreddit_df = pickle.load(open(subreddit_df_pickle, 'rb'))
    subreddit_df = sc.broadcast(_subreddit_df)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(_subreddit_df)
    dv = sc.broadcast(_dv)

    file = sc.textFile(input_path)
    data = file.map(lambda l: json.loads(l))
    authors_total = data.count()
    author_subreddit_features = data.map(
        lambda x: get_feature_vectors(x, authors_total, subreddit_df, dv))

    output_path = utils.hdfs_get_output_path(
        hdfs_root, job_name)  # output_folder='/tf_squared'
    author_subreddit_features.saveAsPickleFile(output_path)
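# `get_feature_vectors` is defined elsewhere; a hedged sketch of what the
# variant used in this job might look like, turning one author record into a
# TF-IDF vector over the fixed subreddit feature space. The record layout
# ({'author': ..., 'subreddits': [...]}) and the exact weighting are
# assumptions, not the repo's confirmed implementation.
import math


def get_feature_vectors(x, authors_total, subreddit_df, dv):
    # term frequency: how often the author posted in each subreddit
    tf = {s['subreddit']: s['submissions'] + s['comments']
          for s in x['subreddits']}
    # weight by inverse document frequency from the broadcast df dict
    tfidf = {name: count * math.log(authors_total / subreddit_df.value[name])
             for name, count in tf.items() if name in subreddit_df.value}
    # project onto the feature space fixed by the broadcast DictVectorizer
    return {'author': x['author'], 'vec': dv.value.transform(tfidf)}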
def analyze(sc, **kwargs):
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    subs_path = kwargs['subs-path']
    coms_path = kwargs['coms-path']
    botlist_csv = kwargs['botlist-csv']
    stopwords_csv = kwargs['stopwords-csv']  # ranksnl_stopwords.csv
    sem_model_path = kwargs['sem-model-path']
    sem_path = kwargs['sem-path']

    sc.addFile(sem_model_path)
    sc.addPyFile(sem_path)
    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)

    print("\n#sematicize():")
    sys.path.insert(0, SparkFiles.get(sem_path.split('/')[-1]))
    from semanticizest import Semanticizer
    print("\n#loading model...")
    _sem = Semanticizer(SparkFiles.get(sem_model_path.split('/')[-1]))
    print("\n#model loaded.")
    sem = sc.broadcast(_sem)

    with open(stopwords_csv, 'r') as f:
        reader = csv.reader(f)
        _stopwords = list(map(lambda x: x[0], list(reader)))
    stopwords = sc.broadcast(_stopwords)

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    subs_file = sc.textFile(subs_path)
    subs_data = subs_file.map(lambda l: json.loads(l))
    coms_file = sc.textFile(coms_path)
    coms_data = coms_file.map(lambda l: json.loads(l))

    print("\n#Filtering sub/com...")
    subs_data = subs_data.filter(
        lambda x: x['author'] not in botlist.value and
        len(x["selftext"]) > 0 and x["selftext"] != "[removed]")
    coms_data = coms_data.filter(
        lambda x: x['author'] not in botlist.value and
        len(x["body"]) > 0 and x["body"] != "[removed]")

    print("\n#Running nltk...")
    subs_sentences = subs_data.map(lambda x: get_sentences(x, 'selftext'))
    coms_sentences = coms_data.map(lambda x: get_sentences(x, 'body'))

    print("\n#Running wikifier...")
    subs_entities = subs_sentences.map(lambda x: semanticize(x, sem))
    coms_entities = coms_sentences.map(lambda x: semanticize(x, sem))

    print("\n#Union, flatten, reduce...")
    united = sc.union([subs_entities, coms_entities])
    entities_flat = united.flatMap(
        lambda x: [((x['author'], e), 1) for e in x['entities']])
    print(entities_flat.take(1))
    # [(('Toby-OrNotToby', ('HMV', 0.2189578713968958, 'Q10854572', 1144829, 'HMV')), 1)]
    entities_reduced = entities_flat.reduceByKey(lambda a, b: a + b)
    print(entities_reduced.take(1))
    # [(('disinterestedMarmot', ('Solutions of the Einstein field equations', 5.930494603249911e-05, 'Q4394061', 2001621, 'solution')), 9)]
    entities_grouped_by_author = entities_reduced.map(
        lambda x: (x[0][0], (x[0][1], x[1]))).groupByKey().mapValues(list)
    print(entities_grouped_by_author.take(1))

    print("\n#Saving results...")
    output_json = entities_grouped_by_author.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    output_json.saveAsTextFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))

    page_topics = pickle.load(
        open(local_output_root + '/page_topics.pickle', 'rb'))
    _entity_df = pickle.load(
        open(local_output_root + '/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    topics_df = {
        'Academic disciplines': 169126, 'Arts': 165790, 'Business': 165670,
        'Concepts': 159671, 'Culture': 169696, 'Education': 162128,
        'Entertainment': 166557, 'Events': 157631, 'Geography': 164197,
        'Health': 168352, 'History': 166707, 'Humanities': 169517,
        'Language': 168451, 'Law': 163853, 'Life': 167678,
        'Mathematics': 157341, 'Nature': 167276, 'Other': 129536,
        'People': 144695, 'Philosophy': 163002, 'Politics': 167504,
        'Reference': 157377, 'Religion': 161830, 'Science': 167156,
        'Society': 170080, 'Sports': 158917, 'Technology': 167069,
        'Universe': 160159, 'World': 164604
    }

    topic_pages = pickle.load(
        open(local_output_root + '/topic_pages.pickle', 'rb'))

    # vectorizer for each topic
    _vectorizers = {}
    for k, v in topic_pages.items():
        dv = DictVectorizer()
        dv.fit_transform(v)
        _vectorizers[k] = dv
    vectorizers = sc.broadcast(_vectorizers)

    _dv = DictVectorizer()
    features_vec = _dv.fit_transform(topics_df)
    dv = sc.broadcast(_dv)

    data = sc.pickleFile(input_path)
    authors_total = data.count()
    author_topic_entity_vec = data.filter(
        lambda x: len(x['topics']) > 0 and len(x['topics_freq']) > 0) \
        .map(lambda x: get_feature_vectors(x, authors_total, entity_df,
                                           topics_df, vectorizers, dv))

    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_topic_entity_vec.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    botlist_csv = kwargs['botlist-csv']  # '/home/username/data/botlist.csv'
    stopwords_csv = kwargs[
        'stopwords-csv']  # /home/username/data/ranksnl_stopwords.csv
    sem_model_path = kwargs[
        'sem-model-path']  # 'hdfs://hadoop:8020/user/username/data/enwiki_pages_n3_1.model'
    sem_path = kwargs[
        'sem-path']  # '/home/username/tools/semanticizest/semanticizest3.zip'
    subs_path = kwargs['subs-path']
    coms_path = kwargs['coms-path']

    sc.addFile(sem_model_path)
    sc.addPyFile(sem_path)
    sys.path.insert(0, SparkFiles.get('libs.zip'))
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    sc.addFile("hdfs://hadoop:8020/user/username/nltk_data", recursive=True)

    print("\n#sematicize():")
    sys.path.insert(0, SparkFiles.get(sem_path.split('/')[-1]))
    from semanticizest import Semanticizer
    print("\n#loading model...")
    _sem = Semanticizer(SparkFiles.get(sem_model_path.split('/')[-1]))
    print("\n#model loaded.")
    sem = sc.broadcast(_sem)

    with open(stopwords_csv, 'r') as f:
        reader = csv.reader(f)
        _stopwords = list(map(lambda x: x[0], list(reader)))
    stopwords = sc.broadcast(_stopwords)

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    import nltk
    nltk.data.path.append(SparkFiles.get("nltk_data"))
    from nltk import ne_chunk, pos_tag, word_tokenize, sent_tokenize
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer
    lemmatizer = sc.broadcast(WordNetLemmatizer())
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    analyser = sc.broadcast(SentimentIntensityAnalyzer())

    subs_data = sc.pickleFile(subs_path)
    coms_data = sc.pickleFile(coms_path)

    print("\n#Running nltk, semanticizer...")
    subs_entities = subs_data.map(
        lambda x: process_sentences(x, 'selftext', sem, lemmatizer, analyser))
    coms_entities = coms_data.map(
        lambda x: process_sentences(x, 'body', sem, lemmatizer, analyser))

    print("\n#Union, flatten, reduce...")
    united = sc.union([subs_entities, coms_entities])
    entities_reduced = united.reduceByKey(lambda a, b: (a + b))

    ### remove duplicates
    author_entity_cleaned = entities_reduced.map(
        lambda x: (x[0], list(set(x[1]))))

    ### calculate entity frequencies
    ### TODO: used for freq-entity vector jobs, run separately from this job
    entities_flat = author_entity_cleaned.flatMap(
        lambda x: [((x[0], e[2]), (e, 1)) for e in x[1]])
    entities_reduced = entities_flat.reduceByKey(
        lambda a, b: (a[0], a[1] + b[1]))
    entities_grouped_by_author = entities_reduced.map(
        lambda x: (x[0][0], x[1])).groupByKey().mapValues(list)
    entities_grouped_by_author.saveAsPickleFile(
        '/user/username/data/output/_jobs/author_entity_freq')

    ### add category and top-category to entities
    ### TODO: run as a separate job
    subreddit_category = pickle.load(
        open(
            '/home/username/data/output/_jobs/subreddit_category_index.pickle',
            'rb'))
    subreddit_topcategory = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_topcategory.pickle',
             'rb'))
    entities_flat = author_entity_cleaned.flatMap(
        lambda x: [(e[0], e[1], e[2], e[5]) for e in x[1]]).distinct()
    entities_categories = entities_flat.map(
        lambda x: x + (subreddit_category.get(x[3], 'Other'),
                       subreddit_topcategory.get(x[3], 'Other')))
    entities_categories.saveAsPickleFile(
        '/user/username/data/output/_jobs/author_entity_categories')

    ### create two dictionaries (top)category -> list of entities
    entities_categories = sc.pickleFile(
        '/user/username/data/output/_jobs/author_entity_categories')
    _entities_categories = entities_categories.map(
        lambda x: (x[2], x[4], x[5])).collect()
    category_entities = {}
    topcategory_entities = {}
    for x in _entities_categories:
        e = x[0]
        cat = x[1]
        topcat = x[2]
        if cat not in category_entities:
            category_entities[cat] = {}
        category_entities[cat][e] = 0
        if topcat not in topcategory_entities:
            topcategory_entities[topcat] = {}
        topcategory_entities[topcat][e] = 0
    pickle.dump(
        category_entities,
        open('/home/username/data/output/_jobs/category_entities.pickle',
             'wb'))
    pickle.dump(
        topcategory_entities,
        open('/home/username/data/output/_jobs/topcategory_entities.pickle',
             'wb'))

    ### create dictionary entity -> categories
    ### TODO: run as a separate job
    entity_categories_dict = {}
    for x in _entities_categories:
        e = x[0]
        cat = x[1]
        topcat = x[2]
        if e not in entity_categories_dict:
            entity_categories_dict[e] = {}
        entity_categories_dict[e]['cat'] = cat
        entity_categories_dict[e]['topcat'] = topcat
    pickle.dump(
        entity_categories_dict,
        open('/home/username/data/output/_jobs/entity_categories_dict.pickle',
             'wb'))

    print("\n#Saving results...")
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_entity_cleaned.saveAsPickleFile(output_path)
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    timestamp = int(time.time())
    hdfs_root = kwargs['hdfs-root']
    subs_path = kwargs['subs-path']
    coms_path = kwargs['coms-path']
    botlist_csv = kwargs['botlist-csv']

    with open(botlist_csv, 'r') as f:
        reader = csv.reader(f)
        _botlist = list(map(lambda x: x[0], list(reader)))
    botlist = sc.broadcast(_botlist)

    ### author-submission frequencies
    subs_data = sc.pickleFile(subs_path)  # corpora-reddit/corpus-submissions
    subs_data = subs_data.filter(lambda x: True if all(
        k in x for k in ('author', 'subreddit')) else False)

    # keep only authors that appear in pairs
    authors_from_pairs = set(
        pickle.load(
            open('/home/username/data/output/_jobs/authors_from_pairs.pickle',
                 'rb')))
    subs_data = subs_data.filter(lambda x: x['author'] in authors_from_pairs)

    author_subreddit_submission = subs_data.map(lambda x: (
        (x['author'], x['subreddit']), 1)).reduceByKey(lambda a, b: a + b)
    author_subreddit_submission = author_subreddit_submission.map(
        lambda x: {
            'author': x[0][0],
            'subreddit': x[0][1],
            'submissions': x[1]
        })

    ### author-comment frequencies
    coms_data = sc.pickleFile(coms_path)  # corpora-reddit/corpus-comments
    coms_data = coms_data.filter(lambda x: True if all(
        k in x for k in ('author', 'subreddit')) else False)
    coms_data = coms_data.filter(lambda x: x['author'] in authors_from_pairs)

    author_subreddit_comment = coms_data.map(lambda x: (
        (x['author'], x['subreddit']), 1)).reduceByKey(lambda a, b: a + b)
    author_subreddit_comment = author_subreddit_comment.map(
        lambda x: {
            'author': x[0][0],
            'subreddit': x[0][1],
            'comments': x[1]
        })

    data1 = author_subreddit_submission.map(extend).map(
        lambda x: ((x['author'], x['subreddit']), x))
    data2 = author_subreddit_comment.map(extend).map(
        lambda x: ((x['author'], x['subreddit']), x))
    united = data1.union(data2).reduceByKey(reduce_subreddit)
    print('united:', united.count())

    author_subreddit_union = united.map(
        lambda x: {
            'author': x[1]['author'],
            'subreddit': x[1]['subreddit'],
            'submissions': x[1]['submissions'],
            'comments': x[1]['comments']
        })

    ### Filter user profile subreddits
    print('author_subreddit_union:', author_subreddit_union.count())
    author_subreddit_union = author_subreddit_union.filter(
        lambda x: not x['subreddit'].startswith('u_') and x['subreddit'] != '')
    print('author_subreddit_union (user profiles filtered):',
          author_subreddit_union.count())

    ### Filter bots and group by author
    author_subreddit_botlist = author_subreddit_union.filter(
        lambda x: x['author'] not in botlist.value)
    author_subreddit_grouped = author_subreddit_botlist.map(
        lambda x: (x['author'], x)).groupByKey().mapValues(list)
    print('author_subreddit_botlist:', author_subreddit_botlist.count())
    print('author_subreddit_grouped:', author_subreddit_grouped.count())
    author_subreddit = author_subreddit_grouped.map(lambda x: {
        "author": x[0],
        "subreddits": x[1]
    })

    ### only authors from 'changemyview'
    author_subreddit_cmv = author_subreddit.filter(lambda x: filter_cmv(x))

    author_subreddit_json = author_subreddit_cmv.map(
        lambda l: json.dumps(l, ensure_ascii=False).encode('utf8'))
    output_path = utils.hdfs_get_output_path(hdfs_root, job_name)
    author_subreddit_json.saveAsTextFile(output_path)
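# `extend` and `reduce_subreddit` are defined elsewhere in the repo. Hedged
# sketches of what they plausibly do, inferred from how the union above is
# reduced and then read back via x[1]['submissions'] / x[1]['comments']:
# `extend` fills in whichever count a record is missing, and `reduce_subreddit`
# merges the submission and comment records for the same (author, subreddit).
def extend(x):
    # ensure both counters exist so records from either side can be merged
    x.setdefault('submissions', 0)
    x.setdefault('comments', 0)
    return x


def reduce_subreddit(a, b):
    # combine the per-subreddit submission and comment counts of one author
    return {
        'author': a['author'],
        'subreddit': a['subreddit'],
        'submissions': a['submissions'] + b['submissions'],
        'comments': a['comments'] + b['comments'],
    }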
def analyze(sc, **kwargs):
    pp = kwargs['pp']
    job_name = kwargs['job-name']
    hdfs_root = kwargs['hdfs-root']
    local_output_root = kwargs['local-output-root']
    input_path = utils.get_input_path(hdfs_root, kwargs.get('input-job', None),
                                      kwargs.get('input-path', None))
    '''TODO:
    - load category_entities and topcategory_entities dicts
    - author_entity: group list of entities into categories / topcategories
    '''

    _subreddit_category = pickle.load(
        open(
            '/home/username/data/output/_jobs/subreddit_category_index.pickle',
            'rb'))
    subreddit_category = sc.broadcast(_subreddit_category)
    _subreddit_topcategory = pickle.load(
        open('/home/username/data/output/_jobs/subreddit_topcategory.pickle',
             'rb'))
    subreddit_topcategory = sc.broadcast(_subreddit_topcategory)

    category_entities = pickle.load(
        open('/home/username/data/output/_jobs/category_entities.pickle',
             'rb'))
    topcategory_entities = pickle.load(
        open('/home/username/data/output/_jobs/topcategory_entities.pickle',
             'rb'))

    _entity_df = pickle.load(
        open('/home/username/data/output/_jobs/entity_df.pickle', 'rb'))
    entity_df = sc.broadcast(_entity_df)

    # create a vectorizer for each category out of the entity_df dataset
    # _vectorizers_cats = {}
    # for k, v in category_entities.items():
    #     dv = DictVectorizer()
    #     dv.fit_transform(v)
    #     _vectorizers_cats[k] = dv
    # vectorizers_cats = sc.broadcast(_vectorizers_cats)
    # pickle.dump(_vectorizers_cats, open('/home/username/data/output/_jobs/vectorizers_cats.pickle', 'wb'))
    _vectorizers_cats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_cats.pickle', 'rb'))
    vectorizers_cats = sc.broadcast(_vectorizers_cats)

    # _vectorizers_topcats = {}
    # for k, v in topcategory_entities.items():
    #     dv = DictVectorizer()
    #     dv.fit_transform(v)
    #     _vectorizers_topcats[k] = dv
    # vectorizers_topcats = sc.broadcast(_vectorizers_topcats)
    # pickle.dump(_vectorizers_topcats, open('/home/username/data/output/_jobs/vectorizers_topcats.pickle', 'wb'))
    _vectorizers_topcats = pickle.load(
        open('/home/username/data/output/_jobs/vectorizers_topcats.pickle',
             'rb'))
    vectorizers_topcats = sc.broadcast(_vectorizers_topcats)

    # author_entities_categories = sc.pickleFile('/user/username/data/output/_jobs/author_entity_categories')
    data = sc.pickleFile(
        '/user/username/data/output/_jobs/author_entity/latest')
    authors_total = data.count()

    ### calculate category_entities feature vectors
    author_entity_features = data.map(lambda x: get_feature_vectors(
        x, authors_total, entity_df, vectorizers_cats, vectorizers_topcats,
        subreddit_category, subreddit_topcategory))

    output_path = utils.hdfs_get_output_path(
        hdfs_root, job_name)  # output_folder='/tf_squared'
    author_entity_features.saveAsPickleFile(output_path)