def analyze(input, output):
    """Score tweets from a JSON-lines file and append the results.

    Reads *input* line by line, where every line is a JSON object with a
    'text' field, attaches the positive-sentiment score returned by
    interface.predictTweet under the 'sentiment' key, and appends each
    augmented record as one JSON line to *output*.
    """
    # NOTE(review): the first parameter shadows the builtin input();
    # kept as-is so keyword callers are not broken.
    with open(input, 'r') as src, open(output, 'a') as dst:
        for raw in src:
            record = json.loads(raw)
            record['sentiment'] = interface.predictTweet(record['text'])['pos']
            dst.write(json.dumps(record))
            dst.write('\n')
def clean_str(text):
    """Normalise raw tweet text for sentiment prediction.

    Strips hyperlinks and literal '\\n' sequences, drops Twitter
    @-mentions and single-character tokens (multi-character smileys are
    kept), removes the '#' prefix from hashtags, and returns the
    remaining tokens lower-cased and space-joined.
    """
    import re
    # Parameter renamed from `str` (shadowed the builtin) to `text`.
    text = text + " "  # trailing space lets the URL regex below anchor on ' '
    text = re.sub("http[^ ]*[\\\]", "\\\\", text)  # collapse URLs ending in a backslash
    text = re.sub("http[^ ]* ", " ", text)         # remove remaining hyperlinks
    text = text.replace('\\n', ' ')                # literal backslash-n sequences
    # Tokenise: words (with inner -/'), lone apostrophes, smiley/punct runs,
    # or any non-space char followed by word chars (catches @mentions, #tags).
    tokens = re.findall(r"\w+(?:[-']\w+)*|'|[:)-.(]+|\S\w*", text)
    # Drop single-character tokens and @-mentions.
    tokens = [tok for tok in tokens if len(tok) > 1 and tok[0] != '@']
    # Keep hashtag words, minus the leading '#'.
    tokens = [tok[1:] if tok[0] == '#' else tok for tok in tokens]
    return " ".join(tokens).lower().strip()


def main():
    """Batch job: clean tweets from a text file, score them, save predictions.

    argv[1] is "input_path,output_path".
    """
    fp, out = sys.argv[1].split(',')
    sc = pyspark_cassandra.CassandraSparkContext()
    data = sc.textFile(fp, 36)
    clean_text = data.map(json.loads) \
        .map(lambda x: (x, clean_str(x['text'])))
    # NOTE(review): predictTweet is not defined or imported in this unit --
    # presumably interface.predictTweet; confirm before running.
    json_preds = clean_text.map(lambda x: (x[0], predictTweet(x[1])['pos'])) \
        .map(json.dumps)
    json_preds.saveAsTextFile(out)


# Guarded so importing this module no longer launches the Spark job.
if __name__ == "__main__":
    main()
import interface

# Smoke-test the sentiment interface on a handful of sample inputs.
# Parenthesised single-argument print works under both Python 2 and Python 3,
# unlike the original Python-2-only print statements.
print(interface.predictTweet("I hate you"))
print(interface.predictList(["I hate you", "I feel ambivalent about you.",
                             "Trump in General", "Violence", "I love everything"]))
# Streaming job: read tweets from Kafka, clean and score them, and fan the
# predictions out to per-candidate Cassandra tables.
if __name__ == '__main__':  # was 'main', which never matches -- fixed
    brokers, topic = sys.argv[1:]
    sc = pyspark_cassandra.CassandraSparkContext()
    ssc = StreamingContext(sc, 1)  # 1-second batch interval
    kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                        {"metadata.broker.list": brokers})
    clean_text = kvs.map(lambda x: json.loads(x[1])) \
        .map(lambda x: (x, clean_str(x['text'])))
    # Build the per-tweet Cassandra rows.  The original code rebound the name
    # `db_dict` to this mapped stream, so the lazily evaluated lambda would
    # have called the stream object instead of the row-building function.
    rows = clean_text.map(lambda x: db_dict(x[0], predictTweet(x[1])['pos']))
    trump = rows.filter(lambda x: x['candidate'] == 'trump')
    hillary = rows.filter(lambda x: x['candidate'] == 'hillary')
    bernie = rows.filter(lambda x: x['candidate'] == 'bernie')
    zodiac_killer = rows.filter(lambda x: x['candidate'] == 'cruz')
    # NOTE(review): `parties` is filtered but never saved anywhere -- confirm intent.
    parties = rows.filter(lambda x: x['candidate'] == 'parties')
    if not trump.isEmpty():
        trump.saveToCassandra('db', 'trump')
    if not hillary.isEmpty():
        hillary.saveToCassandra('db', 'hillary')
    if not bernie.isEmpty():
        bernie.saveToCassandra('db', 'bernie')
    if not zodiac_killer.isEmpty():
        zodiac_killer.saveToCassandra('db', 'cruz')
    # NOTE(review): no ssc.start()/ssc.awaitTermination() visible here -- the
    # streaming context is never started unless that happens elsewhere.
import json
import sys

import pyspark_cassandra


def clean_str(text):
    """Normalise raw tweet text for sentiment prediction.

    Strips hyperlinks and literal '\\n' sequences, drops Twitter
    @-mentions and single-character tokens (multi-character smileys are
    kept), removes the '#' prefix from hashtags, and returns the
    remaining tokens lower-cased and space-joined.
    """
    import re
    # Parameter renamed from `str` (shadowed the builtin) to `text`.
    text = text + " "  # trailing space lets the URL regex below anchor on ' '
    text = re.sub("http[^ ]*[\\\]", "\\\\", text)  # collapse URLs ending in a backslash
    text = re.sub("http[^ ]* ", " ", text)         # remove remaining hyperlinks
    text = text.replace('\\n', ' ')                # literal backslash-n sequences
    # Tokenise: words (with inner -/'), lone apostrophes, smiley/punct runs,
    # or any non-space char followed by word chars (catches @mentions, #tags).
    tokens = re.findall(r"\w+(?:[-']\w+)*|'|[:)-.(]+|\S\w*", text)
    # Drop single-character tokens and @-mentions.
    tokens = [tok for tok in tokens if len(tok) > 1 and tok[0] != '@']
    # Keep hashtag words, minus the leading '#'.
    tokens = [tok[1:] if tok[0] == '#' else tok for tok in tokens]
    return " ".join(tokens).lower().strip()


def main():
    """Batch job: clean tweets from a text file, score them, save predictions.

    argv[1] is "input_path,output_path".
    """
    fp, out = sys.argv[1].split(',')
    sc = pyspark_cassandra.CassandraSparkContext()
    data = sc.textFile(fp, 36)
    clean_text = data.map(json.loads) \
        .map(lambda x: (x, clean_str(x['text'])))
    # NOTE(review): predictTweet is not defined or imported in this file --
    # presumably interface.predictTweet; confirm before running.
    json_preds = clean_text.map(lambda x: (x[0], predictTweet(x[1])['pos'])) \
        .map(json.dumps)
    json_preds.saveAsTextFile(out)


# Guarded so importing this module no longer launches the Spark job.
if __name__ == "__main__":
    main()