def test_analyze_bounds(self):
    """
    Check that the score of a tweet stays within the closed range [0.0, 1.0].
    """
    analyzer = Analyzer()
    sample = "this is a test neutral tweet"

    # The tweet is scored once per bound, upper bound first.
    assert analyzer.analyze(sample) <= 1.0
    assert analyzer.analyze(sample) >= 0.0
def test_analyze_empty(self):
    """
    Check that an empty tweet, and a tweet made only of words missing from
    the dictionary, both get the middle score 0.5.
    """
    analyzer = Analyzer()

    for text in ("", "hzoehfsdl"):
        assert analyzer.analyze(text) == 0.5
def test_analyze_judgement_weight(self):
    """
    Check the relative ordering of the scores of arbitrary tweets.
    """
    analyzer = Analyzer()

    # Positive pair: the longer tweet must score strictly higher.
    longer_happy = analyzer.analyze("i am so happy, great day :D")
    shorter_happy = analyzer.analyze("i am so happy :D")
    assert longer_happy > shorter_happy

    # Negative pair: the longer tweet must score strictly lower.
    longer_sad = analyzer.analyze("so sad, feeling depressed :'(")
    shorter_sad = analyzer.analyze("so depressed :'(")
    assert longer_sad < shorter_sad
def test_analyze_judgement(self):
    """
    Check the judgement made by the sentiment analysis:
        * positive and negative tweets land on the right side of 0.5
        * the best and the worst tweets reach the extreme values
    """
    analyzer = Analyzer()

    # Polarity: the second check only runs when the first one holds.
    assert analyzer.analyze(":)") > 0.5
    assert analyzer.analyze(":'(") < 0.5

    # Extreme scores.
    assert analyzer.analyze("yahoo yahoo yahoo") == 1.0
    assert analyzer.analyze("zzz zzz zzz zzz zzz") == 0.0
def test_categories_cardinality(self):
    """
    Check how many words are counted in each category
    (positive, negative, neutral) and the value returned for the tweet.
    """
    analyzer = Analyzer()
    counters = {'positive': 0, 'negative': 0, 'neutral': 0}
    tweet = 'great day today lol ;) but still have to work'

    # The call fills `counters` in place and returns a total.
    returned = analyzer.categories_cardinality(tweet, counters)
    assert returned == 15

    expected_counts = (
        ('positive', 4),  # great day lol ;)
        ('neutral', 1),   # today
        ('negative', 2),  # work still
    )
    for category, expected in expected_counts:
        assert counters[category] == expected
def test_categories_weight(self):
    """
    Check the weighted totals computed for each category
    (positive, negative, neutral).
    """
    analyzer = Analyzer()
    totals = {'positive': 0.0, 'negative': 0.0, 'neutral': 0.0}
    counts = {'positive': 4, 'negative': 2, 'neutral': 1}
    scores = [2, 3, 0, 2, 2, 0, -4, 0, 0, -2, 2]

    # The result must unpack into exactly three values.
    tot_pos, tot_neg, tot_neu = analyzer.weight_categories(
        scores, totals, counts)

    assert tot_pos == 99.47646509317096
    assert tot_neg == -49.392885301738836
    assert tot_neu == 3.9750077625545726
def analyze(chart_ids: List[str] = [],
            src: str = CHART_PATH,
            dest: str = default_excel_path):
    """
    Analyzes charts given a list of IDs and saves the stats to an Excel file.
    If you want to analyze all levels in src, don't input any IDs.

    Args:
        chart_ids: IDs of the charts to analyze. When empty, every entry of
            ``src`` accepted by ``is_chart_folder`` is analyzed.
        src: Folder that contains the chart folders.
        dest: Path of the Excel file the statistics are written to.

    Returns:
        None. When no chart is found the function reports it and returns
        without writing anything.
    """
    # The default list is only ever rebound below, never mutated in place,
    # so sharing it between calls is harmless.
    if not chart_ids:
        with os.scandir(src) as dir_items:
            chart_ids = [
                entry.name for entry in dir_items
                if is_chart_folder(entry.path)
            ]
        if not chart_ids:
            # Nothing to analyze: stop here instead of showing an empty
            # progress bar, writing an empty workbook and reporting success.
            click.echo("No charts in the folder!")
            return

    src = os.path.abspath(src)
    dest = os.path.abspath(dest)
    # Create the destination folder up front so an unusable path fails
    # before the analysis starts.
    os.makedirs(os.path.dirname(dest), exist_ok=True)

    stat_list = {}
    with click.progressbar(chart_ids,
                           label=f"Analyzing {len(chart_ids)} charts...",
                           item_show_func=lambda x: x) as prog_bar:
        for chart_id in prog_bar:
            analyzer = Analyzer(src, chart_id)
            analyzer.start()
            stat_list[chart_id] = analyzer.get_stats_as_json()

    click.echo(f"Done analyzing, now saving to {dest}...")

    # One row per chart, indexed by chart id.
    stat_df = pd.DataFrame.from_dict(stat_list, orient="index")
    stat_df.index.name = "chart_id"

    excel_writer = ExcelWriter(stat_df, dest)
    excel_writer.format_table()
    excel_writer.close()
    click.echo("Stats successfully saved.")
# Load the backed-up tweet files, one loader per candidate.
h = TweetLoader('', path='data/backup/', filename='hillary_2016-07-13.json')
t = TweetLoader('', path='data/backup/', filename='trump_2016-07-13.json')
h.load()
t.load()

# Join them together
full_tweets = pd.concat([h.tweets, t.tweets])

# Assign label (second array) for Hillary(0)/Trump(1) tweets
# The labels follow the concatenation order above: all of h first, then t.
label_array = np.array([0] * len(h.tweets) + [1] * len(t.tweets))

# Run through part of the model to get the PCA results and loading factors
# This is not the full model, just a part of it for illustration purposes
max_words = 50
mod = Analyzer(full_tweets['text'], labels=label_array, max_words=max_words, load_pca=False)
# mod.load_words()
mod.get_words()
mod.create_dtm()
mod.run_pca()

# `loadings` is the same object as mod.loadings, so relabelling its index
# also relabels mod.loadings.
loadings = mod.loadings
loadings.index = ['PC' + str(j + 1) for j in range(len(loadings))]
# loadings = loadings.iloc[0:30, :]  # Use only a subset of the data
loadings = loadings.transpose()  # Use rotation
# NOTE(review): after the transpose the 'PC…' labels assigned to the index
# above become the columns, so `words` receives them and `pc_names`
# receives the former column labels — confirm the two names are not swapped.
words = loadings.columns.tolist()
pc_names = loadings.index.tolist()
def main(argv=None):
    """
    Command-line entry point: pick a survey question and analyze its answers.

    Options: -h/--help, -l/--list, --file <csv path>, --facet <question text>.
    The facet is matched case-insensitively as a substring of the long
    question texts in SHORT_QUESTIONS (e.g. "--facet Q30").

    Args:
        argv: Option list to parse; defaults to sys.argv[1:].

    Returns:
        2 for invalid options or arguments, 0 after printing the help or
        the question list. On a normal run the function does not return:
        it ends through sys.exit() (also used, with a message, when the
        facet matches no survey question).
    """
    # read in params
    if argv is None:
        argv = sys.argv[1:]

    csv_path = 'tulalens_survey_sample.csv'
    facet = 'result id'

    # standard python parsing for command line options
    try:
        opts, args = getopt.getopt(argv, "hl",
                                   ["help", "list", "file=", "facet="])
    except getopt.GetoptError as msg:
        # Write to stderr explicitly: this form works on Python 2 and 3.
        sys.stderr.write("%s\n" % msg)
        sys.stderr.write("For help use --help\n")
        return 2

    if args:
        sys.stderr.write("Invalid arg(s) %s\n" % args)
        usage()
        return 2

    for (opt, val) in opts:
        if opt in ("-h", "--help"):
            usage()
            return 0
        if opt in ("-l", "--list"):
            # `list` is looked up at module level, exactly as before.
            list()
            return 0
        elif opt == "--file":
            csv_path = val
        elif opt == "--facet":
            facet = val.lower()
        else:
            usage()
            return 2

    print("facet: %s" % facet)

    # check if facet given is in the list of survey questions
    # ideally this allows for quick entries with just the
    # question number, e.g. "--facet Q30"
    for long_text, short in SHORT_QUESTIONS.items():
        if facet in long_text:
            # turn the facet into easy to use question ids
            facet = short
            print("Question selected: %s" % long_text)
            break
    else:
        # No question text contained the requested facet.
        sys.exit("facet selected is not a survey question")

    # parse csv file
    parser = CsvParse(csv_path)
    answers = parser.parse()

    # generate analysis based on options
    analyzer = Analyzer(answers)

    # find the unique occurrence of each answer to the question
    answers_count = analyzer.group_by(facet)
    # The result is not used here; the call is kept because find_mean may
    # report its result itself.
    analyzer.find_mean(facet, answers_count)

    sys.exit()
"pem_name",
    # (continuation of a parser.add_argument call opened before this chunk)
    help=
    "Name of the PEM file that is needed to connect to the data collection servers."
)
parser.add_argument(
    "database_ip",
    help="IP of the Postgres database that the results will be put into.")
# nargs='+' requires at least one collector IP.
parser.add_argument("data_collector_ips",
                    nargs='+',
                    help="List of IPs of the data collection servers.")
args = parser.parse_args()

ec = External_Connector(args.pem_name, args.database_ip)

# Create list of local files, first is twitter data, rest is news data
# One local file name per collector IP, built as "<type><index>.txt".
# NOTE(review): every name uses args.type as its prefix, so the
# twitter/news split described above is not visible in this code — confirm.
files = [
    "%s%d.txt" % (args.type, i)
    for i in range(0, len(args.data_collector_ips))
]
ec.get_data_files(args.data_collector_ips, files)

a = Analyzer()

# Run three analyses for each data file and upload them to database
for f in files:
    sentiment, mood, emoticon = a.run(args.type, f)
    ec.insert_sentiment(args.run_id, args.type, sentiment)
    ec.insert_mood(args.run_id, args.type, mood)
    ec.insert_emoticon(args.run_id, args.type, emoticon)
# timer STOP = time.time() print(f"\t-----> Done.") print(f"\t-----> Execution time: {round(STOP-START, 2)} sec") if __name__ == "__main__": app_settings = { 'client_id': os.getenv('SPOTIFY_CLIENT_ID'), 'client_secret': os.getenv('SPOTIFY_CLIENT_SECRET'), 'redirect_uri': os.getenv('SPOTIFY_REDIRECT_URI') } # init analyzer az = Analyzer(**app_settings) # get tracks and simulate lengths # get all playlists playlists = az.user_playlists(is_author=True) start = time.time() print("-----> Gathering all tracks...", end="") # get all tracks all_tracks = [] for playlist in playlists: tracks = az.playlist_tracks(playlist['id']) # append the playlist meta data # to the track objects for i in range(len(tracks)): tracks[i]['playlist'] = playlist
def analyze(self, expr):
    """
    Analyze `expr` and fill the view with one section per property.

    A new Analyzer is stored in `self.analyzer`, the function view is
    updated, and the sections are packed into `self.box` in this order:
    domain, roots, sign, continuity, branches, growth (first derivative
    and extrema) and concavity (second derivative and inflection points).
    `self.show_all()` is called at the end.

    Args:
        expr: Expression of the function to analyze; it is passed to
            Analyzer and to the function view unchanged.
    """
    name = "f"
    self.analyzer = Analyzer(expr)
    analyzer = self.analyzer

    # Compare sets with the EmptySet singleton itself. The former test,
    # `x.__class__ != sympy.EmptySet`, is always true on SymPy releases
    # where `sympy.EmptySet` is that singleton instance rather than a class.
    empty = sympy.S.EmptySet

    def add_section(title):
        """Create a titled ListBox, pack it into self.box and return it."""
        section = ListBox(title)
        self.box.pack_start(section, False, False, 0)
        return section

    def add_points(row, points, margin_right=None):
        """Append one PointBlock per (x, y) point to `row`."""
        for point in points:
            _x = MathView.new_from_expression(point[0])
            _y = MathView.new_from_expression(point[1])
            block = PointBlock(_x, _y)
            if margin_right is not None:
                block.set_margin_right(margin_right)
            row.add_child(block)

    def add_interval_row(section, text, region):
        """Add a text row followed by the rendering of `region`."""
        row = section.make_row_with_child(TextBlock(text))
        row.add_child(make_interval_points(region))

    self.function_view.set_from_expression(expr, name=name + "(x)")
    self.function_view.set_font_size(40)

    # Domain
    box = add_section("Dominio")
    domain_block = EqualBlock(TextBlock("D(f)"),
                              TextBlock(interval_to_string(analyzer.domain)))
    box.make_row_with_child(domain_block)

    # Roots
    box = add_section("Raíces")
    box.make_row_with_child(TextBlock(set_to_string(analyzer.roots.keys())))

    # Sign: one row per non-empty region, positive first.
    box = add_section("Signo")
    for prefix, region in (("+ ", analyzer.positive),
                           ("- ", analyzer.negative)):
        if region != empty:
            box.make_row_with_child(
                TextBlock(prefix + interval_to_string(region)))

    # Continuity
    box = add_section("Continuidad")
    if analyzer.continuity == analyzer.domain:
        block = TextBlock("f es continua en todo su dominio.")
    else:
        block = TextBlock("f es continua para los x %s %s\n" % (
            Chars.BELONGS, interval_to_string(analyzer.continuity)))
    box.make_row_with_child(block)

    # Branches at +infinity, then at -infinity.
    box = add_section("Ramas")
    for limit, sign in ((sympy.oo, "+"), (-sympy.oo, "-")):
        branch = analyzer.branches[limit]
        if branch is None:
            continue
        block = TextBlock("f posee %s cuando" % Branch.get_name(*branch))
        row = box.make_row_with_child(block)
        trend_block = TrendBlock(TextBlock("x"),
                                 TextBlock(sign + Chars.INFINITY))
        trend_block.set_margin_left(10)
        row.add_child(trend_block)

    # Growth: first derivative, decreasing/increasing regions, extrema.
    box = add_section("Crecimiento")
    box.make_row_with_child(
        MathView.new_from_expression(analyzer.derived, name + "'(x)"))
    if analyzer.derived_things.negative != empty:
        add_interval_row(box, name + " decrece en ",
                         analyzer.derived_things.negative)
    if analyzer.derived_things.positive != empty:
        add_interval_row(box, name + " crece en ",
                         analyzer.derived_things.positive)

    mins, maxs = analyzer.get_minimums_and_maximums()
    if mins:
        add_points(box.make_row_with_child(TextBlock("Mínimos: ")), mins)
    if maxs:
        add_points(box.make_row_with_child(TextBlock("Máximos: ")), maxs)

    # Concavity: second derivative, positive/negative regions.
    box = add_section("Concavidad")
    box.make_row_with_child(
        MathView.new_from_expression(analyzer.derived2, name + "''(x)"))
    if analyzer.derived2_things.positive != empty:
        add_interval_row(box, "f tiene concavidad positiva en: ",
                         analyzer.derived2_things.positive)
    if analyzer.derived2_things.negative != empty:
        add_interval_row(box, "f tiene concavidad negativa en: ",
                         analyzer.derived2_things.negative)

    # Inflection points are taken as the extrema of the first derivative.
    # NOTE(review): the y values shown are therefore those returned for the
    # derivative, not for f itself — confirm this is intended.
    _analyzer = Analyzer(analyzer.derived)
    mins, maxs = _analyzer.get_minimums_and_maximums()
    inflection_points = mins + maxs
    if inflection_points:
        row = box.make_row_with_child(TextBlock("Puntos de inflexión: "))
        add_points(row, inflection_points, margin_right=10)

    self.show_all()
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project tools importable; must run before the imports below.
sys.path.insert(1, "../tools")

from analysis import Analyzer
from plotting import Plotter
from training import Trainer

if __name__ == "__main__":
    # Plot the normalized radial distribution function of the training
    # trajectory together with the polynomial cutoff, and save the figure.
    sns.set()

    plot_dir = "plots"
    plot_file = os.path.join(plot_dir, "rdf.png")
    # exist_ok=True replaces the exists()/mkdir() pair, which could fail if
    # the folder appeared between the check and the creation.
    os.makedirs(plot_dir, exist_ok=True)

    anl = Analyzer()
    plter = Plotter()
    r_cut = 6.0

    r, rdf = anl.calculate_rdf("trajs/training.traj", r_max=r_cut)
    # Normalize to a maximum of 1. Only non-zero entries are divided, so an
    # all-zero rdf never divides by zero. rdf.max() is the vectorized form
    # of max(rdf).
    rdf[np.nonzero(rdf)] /= rdf.max()
    cutoff = plter.polynomial(r, r_cut, gamma=5.0)

    plt.plot(r, rdf, label="Radial distribution function")
    plt.plot(r, cutoff, label="Polynomial cutoff, gamma=5.0")
    plt.legend()
    plt.title("Copper radial distribution function")
    plt.xlabel("Radial distance [Angstrom]")
    plt.ylabel("Radial distribution function (normalized to 1)")
    plt.savefig(plot_file)
# NOTE(review): the two methods below belong to MarketThread (see the
# super() call); the class header is outside this chunk.
    def __init__(self, market):
        """Store the market this thread will keep updating."""
        super(MarketThread, self).__init__()
        self.market = market

    def run(self):
        """Update the market once per heartbeat until the stop flag is set."""
        # self._stop is not defined in this chunk; presumably an Event
        # provided by the base class — confirm.
        while not self._stop.isSet():
            time.sleep(settings.HEARTBEAT)
            self.market.update()


if __name__ == "__main__":
    # Wire the components: the market and the trading thread share queue q;
    # the executor and the analyzer share portfolio p.
    q = Queue.Queue()
    p = Portfolio(20000)
    e = Executor(p)
    a = Analyzer(portfolio=p)
    m = Market(queue=q)

    trading_thread = TradingThread(queue=q, analyzer=a, events=e)
    market_thread = MarketThread(market=m)

    def receive_signal(signum, stack):
        """SIGINT handler: stop both threads, then exit with status 0."""
        print("You quit")
        trading_thread.stop()
        market_thread.stop()
        sys.exit(0)

    market_thread.start()
    trading_thread.start()
    # The handler is installed only after both threads have been started.
    signal.signal(signal.SIGINT, receive_signal)
import matplotlib.pyplot as plt # Load tweets s2 = TweetLoader(filename='coolstars.json', track_location=False, path='coolstars19/data/') s2.load() df = s2.tweets.copy() df.index = pd.DatetimeIndex(df['created_at']) # Using the Analyzer class max_words = 100 mod = Analyzer(df['text'], None, max_words=max_words, load_pca=False, load_svm=False, more_stop_words=['rt', 'cs19', 'cs19_uppsala']) mod.get_words() mod.create_dtm() mod.run_pca() # Exploration print_dtm(mod.dtm, df['text'], 42) # Top terms in components top_factors(mod.load_squared, 0) # Plots make_biplot(mod.pcscores, None, mod.loadings, 0, 1)
import pandas as pd
import numpy as np

# Some global defaults
max_words = 200

# Load most recent tweets from Hillary Clinton and Donald Trump
# s = TweetLoader(filename='search.json', track_location=True)
s = TweetLoader(filename='search_2016-07-13.json', track_location=True, path='data/backup/')
s.load()

# Calculate and grab model results
# load_pca/load_svm are both True here, and load_full_model() supplies the
# predictions.
mod = Analyzer(s.tweets['text'], max_words=max_words, load_pca=True, load_svm=True)
predict = mod.load_full_model()  # Hillary=0 Trump=1
s.tweets['predict'] = predict

# Clean up missing coordinates
# `bad` marks rows whose coordinates are None; the same mask filters both
# the coordinate column and the tweet table so they stay aligned.
df = s.tweets['geo.coordinates']
bad = df.apply(lambda x: x is None)
df = df[~bad]
s.tweets = s.tweets[~bad]

# Each remaining entry is indexed as a pair: element 0 is stored as lat,
# element 1 as lon.
lat = df.apply(lambda x: x[0])
lon = df.apply(lambda x: x[1])
# lat, lon = zip(*df)  # Alternate

# Remove Alaska and Hawaii
# NOTE(review): the next two statements are the tail of a model-selection
# branch that opens before this chunk (reached for an unknown model name).
    print("Unknown model " + args.model + ".\n")
    exit()
# Send model weights to the device
model.to(args.device)
print(model)

#%%
""" ################### Initialize model and analyzer save ################### """
# Apply weight initialization
model.apply(initializer)
# Create an analyzer object
analyzer = Analyzer(args)

#%%
""" ################### Create optimizer ################### """
# Optimizer and Loss
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)
# The scheduler multiplies the learning rate by `factor` when the
# monitored value stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, threshold=1e-6)
# The model types listed here are trained with an L1 loss.
# NOTE(review): this comment used to say "Use cross-entropy loss", which
# does not match nn.L1Loss(); further branches may follow outside this chunk.
if args.model in ['ae', 'vae', 'wae', 'vae_flow']:
    criterion = nn.L1Loss()
# Merge tweets together, pass to Analyzer
# ignore_index=True renumbers the merged rows from 0. It is the only option
# that differs from the pandas defaults; the call used to spell out every
# default as well, including join_axes, which pandas 1.0 removed (passing
# it raises TypeError there).
df_tweets = pd.concat([h.tweets['text'], t.tweets['text']], ignore_index=True)

# Using the Analyzer class
mod = Analyzer(df_tweets, label_array, max_words=max_words, load_pca=False, load_svm=False,
               use_sentiment=True)

# Step-by-step equivalent, kept for reference:
# mod.get_words()
# mod.create_dtm()
# mod.run_pca()
# mod.get_sentiment()
# test_predict, test_label = mod.run_svm()

# One-line alternative with defaults
test_predict, test_label = mod.create_full_model()

# Check a PCA plot
# mod.make_biplot(2, 3, max_arrow=0.2)
404 error handler used if a non-existent route is requested
    """
    return render_template('404.html'), 404


@app.errorhandler(500)
def page_not_found(exc):
    """
    500 error handler used if there is a server error.

    `exc` is the error passed by the framework; it is not used. The 500
    template is returned together with the 500 status code.
    """
    # NOTE(review): the function name says "not found" although it is
    # registered for status 500 — consider renaming.
    return render_template('500.html'), 500


if __name__ == '__main__':
    analyzer = Analyzer()
    # Serve the app on every interface at PORT.
    server = SocketIOServer(('', PORT), app, resource="socket.io")

    tw_thread = TweetWeather(server, analyzer, name="Tweet-Weather-Thread")
    tw_thread.daemon = True

    # Run the two TweetWeather callbacks as greenlets.
    gevent.spawn(tw_thread.new_post, server)
    gevent.spawn(tw_thread.connexion_lost, server)

    # NOTE(review): the message hard-codes port 5000 while the server is
    # bound to PORT — confirm they are the same.
    print "Application Started: http://localhost:5000"

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C: stop the worker and the server, then exit.
        tw_thread.stop()
        server.stop()
        sys.exit()
# Assign label (second array) for Hillary(0)/Trump(1) tweets
label_array = np.array([0] * len(h.tweets) + [1] * len(t.tweets))

# ignore_index=True renumbers the merged rows from 0 so they line up with
# label_array. The other keywords these calls used to pass were all pandas
# defaults, including join_axes, which pandas 1.0 removed (passing it
# raises TypeError there).
df_tweets = pd.concat([h.tweets['text'], t.tweets['text']], ignore_index=True)

# Using the Analyzer class to get sentiments
mod = Analyzer(df_tweets, label_array)
mod.get_sentiment()

# Group together tweets, labels, and sentiments
temp = pd.concat([h.tweets, t.tweets], ignore_index=True)
# axis=1 places tweets, sentiments and labels side by side as columns.
df = pd.concat([temp, mod.sentiment, pd.DataFrame({'label': label_array})], axis=1)