def theta_labeling(data):
    """Compute a binary theta label for each 2-D slice of *data*.

    Each slice from ``analysis.make_twoD`` is paired with its R/I
    components.  Slices with at least 85 dots get a theta estimate —
    ``analysis.ura`` when the error metric lies in [0.85, 1], otherwise
    the second element of a circle fit — and all other slices get 0.
    A diagnostic scatter of the theta distribution is shown, then each
    theta is binarized: 1 when theta >= 220, else 0.

    NOTE(review): reads module-level globals ``start`` and ``stop`` —
    confirm they are defined before this is called.

    :param data: raw input passed through to the ``analysis`` helpers.
    :return: list of 0/1 labels, one per slice.
    """
    database = analysis.make_twoD(data, start, stop, 128, 80)
    thetas = []
    for i, entry in enumerate(database):
        R, I = analysis.R_I(data, i)
        if analysis.dots(entry) >= 85:
            error = analysis.for_error(R, I)
            if 0.85 <= error <= 1:
                # Log slices whose error metric falls in the "good" band.
                print(i, error)
                theta = analysis.ura(R, I)
            else:
                theta = analysis.circle_fit(R, I)[1]
            thetas.append(theta)
        else:
            # Too few dots to estimate an angle reliably.
            thetas.append(0)
    # Diagnostic plot of the theta distribution.
    angle = analysis.scatter(thetas, 10)
    plt.plot(angle[0], angle[1])
    plt.show()
    # Binarize: 1 when theta >= 220, otherwise 0.
    return [1 if theta >= 220 else 0 for theta in thetas]
def showcorr(vartype=None):
    """Flask view: plot pairwise correlations for the current variable step.

    Steps advance output -> input -> control -> regress.  For the current
    step's variables (plus the inputs when the step is "control"), every
    pair with |rounded corr| >= 0.70 is scattered to disk and rendered in
    ``scatter.html``; uncorrelated variables are collected separately.
    When there is nothing to plot (a single variable), the step's
    "remaining" session list is seeded and the view redirects to the next
    step.

    :param vartype: current step name; may be overridden by the posted form.
    """
    try:
        sessionid = session["id"]
        user_sess = UserSession.query.filter(UserSession.id == sessionid).first()
        pid = user_sess.pid
        aid = user_sess.aid
        p = Project.query.filter(Project.id == pid).first()
        vartype = str(vartype)
        if request.form is not None and "vartype" in request.form:
            vartype = request.form["vartype"]
        if vartype == "regress":
            if "rcont" not in session.keys():
                session["rcont"] = list(set(request.form.getlist("variables")))
            return redirect(url_for("regress"))
        elif vartype == "output":
            corrvars = session["ovars"]
        elif vartype == "input":
            if "rout" not in session.keys():
                session["rout"] = list(set(request.form.getlist("variables")))
            corrvars = session["ivars"]
        else:
            if "rinp" not in session.keys():
                session["rinp"] = list(set(request.form.getlist("variables")))
            corrvars = session["cvars"]

        ivars = session["ivars"]
        pid = user_sess.pid
        csvf = data[pid]

        params = []
        nocor = []
        cors = []
        variables = []
        count = 0
        plots = []
        pltpath = app.config["PLOTPATH"] + "/" + vartype + "/scatter"

        # Plotting the scatterplots
        skipPlot = False
        if vartype == "control":
            # Control correlations also examine control-vs-input pairs.
            combos = itertools.combinations(corrvars + ivars, 2)
        elif len(corrvars) == 1:
            # A single variable has nothing to correlate against.
            skipPlot = True
        else:
            combos = itertools.combinations(corrvars, 2)

        if skipPlot is False:
            # BUG FIX: the original unconditionally rebuilt ``combos`` from
            # ``corrvars`` here, discarding the control+input pairs built
            # above and making the redundancy check below unreachable.
            for combo in combos:
                # Redundancy removal for input variables in control correlations
                if vartype == "control" and combo[0] in ivars and combo[1] in ivars:
                    continue
                x = csvf[combo[0]].fillna(0)
                y = csvf[combo[1]].fillna(0)
                corr = np.corrcoef(x, y)[0][1]
                corr = round(corr, 2)
                if corr >= 0.70:
                    pltfile = analysis.scatter(x, y, count, combo[0], combo[1], pltpath, vartype, corr)
                    filepath = "../static/images/plots/" + vartype + "/" + pltfile
                    # Different path for accessing images through python files versus html files
                    session["plots"].append(filepath[2:])
                    count += 1
                    params.append((filepath, corr))
                    cors.append(combo[0])
                    cors.append(combo[1])
                else:
                    # create list of uncorrelated variables and pass it to vars
                    if vartype == "control":
                        if combo[0] not in ivars:
                            nocor.append(combo[0])
                        if combo[1] not in ivars:
                            nocor.append(combo[1])
                    else:
                        nocor.append(combo[0])
                        nocor.append(combo[1])
            cors = list(set(cors))
            nocor = list(set(nocor))
            # A variable counts as uncorrelated only if it never correlated.
            nocor = [item for item in nocor if item not in cors]
            variables.append(cors)
            variables.append(nocor)
            msg = "none" if count == 0 else "corr"
            # Grow the page height with the number of plot rows (3 per row).
            if len(params) > 3:
                height = str(int(len(params) / 3) * 500) + "px"
            else:
                height = "500px"
            return render_template(
                "scatter.html", params=params, vars=variables, vartype=vartype, msg=msg, height=height
            )
        else:
            # Nothing to plot at this step: seed the step's "remaining"
            # variable list and advance to the next step.
            if vartype == "output":
                session["rout"] = session["ovars"]
                vartype = "input"
            elif vartype == "input":
                session["rinp"] = session["ivars"]
                vartype = "control"
            elif vartype == "control":
                # BUG FIX: the original stored the undefined name ``rcont``
                # (NameError: it had assigned ``rcnt``) and used ``==`` where
                # ``=`` was intended, so vartype never advanced to "regress"
                # and the view redirected to itself forever.
                session["rcont"] = session["cvars"]
                vartype = "regress"
            else:
                return redirect(url_for("regress"))
            return redirect(url_for("showcorr", vartype=vartype))
    except Exception:
        app.logger.exception(traceback.format_exc())
        flash("Sorry, an internal error occurred.")
def showcorr(vartype=None):
    """Legacy session-dict variant of the correlation-plot view.

    NOTE(review): this definition shadows the earlier ``showcorr`` in the
    same module — confirm which version is live and delete the other.

    :param vartype: current step name; may be overridden by the posted form.
    """
    try:
        # BUG FIX: was a Python-2 ``print`` statement, a syntax error under
        # Python 3 that prevented the whole module from importing.
        print("session Dictionary", session)
        vartype = str(vartype)
        if request.form is not None and "vartype" in request.form:
            vartype = request.form["vartype"]
        if vartype == "regress":
            if "rcont" not in session:
                session["rcont"] = list(set(request.form.getlist('variables')))
            return redirect(url_for("regress"))
        elif vartype == "output":
            corrvars = session['output']
        elif vartype == "input":
            if "rout" not in session:
                session["rout"] = list(set(request.form.getlist('variables')))
            corrvars = session["input"]
        else:
            if "rinp" not in session:
                session["rinp"] = list(set(request.form.getlist('variables')))
            corrvars = session["control"]

        ivars = session["input"]
        pid = session["pid"]
        csvf = data[pid]

        params = []
        nocor = []
        cors = []
        count = 0
        plots = []
        pltpath = app.config['PLOTPATH'] + '/' + vartype + '/scatter'

        # Plotting the scatterplots
        skipPlot = False
        if vartype == "control":
            # Control correlations also examine control-vs-input pairs.
            combos = itertools.combinations(corrvars + ivars, 2)
        elif len(corrvars) == 1:
            # A single variable has nothing to correlate against.
            skipPlot = True
        else:
            combos = itertools.combinations(corrvars, 2)

        if skipPlot is False:
            # BUG FIX: the original unconditionally rebuilt ``combos`` from
            # ``corrvars`` here, discarding the control+input pairs built
            # above and making the redundancy check below unreachable.
            for combo in combos:
                # Redundancy removal for input variables in control correlations
                if vartype == "control" and combo[0] in ivars and combo[1] in ivars:
                    continue
                x = csvf[combo[0]].fillna(0)
                y = csvf[combo[1]].fillna(0)
                corr = np.corrcoef(x, y)[0][1]
                corr = round(corr, 2)
                if corr >= 0.70:
                    pltfile = analysis.scatter(x, y, count, combo[0], combo[1], pltpath, vartype, corr)
                    filepath = '../static/images/plots/' + vartype + '/' + pltfile
                    # Different path for accessing images through python files versus html files
                    session["plots"].append(filepath[2:])
                    count += 1
                    params.append((filepath, corr, combo[0], combo[1]))
                    cors.append(combo[0])
                    cors.append(combo[1])
                else:
                    # create list of uncorrelated variables and pass it to vars
                    if vartype == "control":
                        if combo[0] not in ivars:
                            nocor.append(combo[0])
                        if combo[1] not in ivars:
                            nocor.append(combo[1])
            cors = list(set(cors))
            nocor = list(set(nocor))
            # A variable counts as uncorrelated only if it never correlated.
            nocor = [item for item in nocor if item not in cors]
            msg = "none" if count == 0 else "corr"
            return render_template("scatter.html", params=params, vars=nocor, vartype=vartype, msg=msg)
        else:
            # Nothing to plot at this step: seed the step's "remaining"
            # variable list and advance to the next step.
            if vartype == "output":
                session["rout"] = session["output"]
                vartype = "input"
            elif vartype == "input":
                session["rinp"] = session["input"]
                vartype = "control"
            elif vartype == "control":
                session["rcont"] = session["control"]
                # BUG FIX: was ``vartype == "regress"`` (a no-op comparison),
                # so the view redirected to itself with vartype="control"
                # forever instead of advancing.
                vartype = "regress"
            else:
                return redirect(url_for("regress"))
            return redirect(url_for("showcorr", vartype=vartype))
    except Exception:
        app.logger.exception(traceback.format_exc())
        flash('Sorry, an internal error occurred.')
def main(context):
    """Main function takes a Spark SQL context.

    Tries to load a previously computed full dataset; on any failure it
    falls back to running the whole pipeline (read, label, clean,
    vectorize, train per-tag regression models, score the full comment
    set).  Either way it then produces the top-stories, map, sentiment
    and scatter analyses.
    """
    # --- User defined functions ---
    try:
        print("Attempting to load full dataset...")
        tags = ["demP", "demN", "gopP", "gopN", "djtP", "djtN"]
        # The zero arguments are placeholders; the final True presumably
        # tells run_full to load a cached result -- TODO confirm.
        df_full = run_full(0, context, 0, 0, 0, True)
        df_sub = read_submission(context)
    except Exception:
        # BUG FIX: was a bare ``except:``, which also swallows
        # KeyboardInterrupt/SystemExit.  Fall back to the full pipeline.
        sanitize = udf(sanitizeX, ArrayType(StringType()))
        # --- Read Files ---
        # print("Loading Files")
        df_comm = read_comments_minimal(context)
        df_sub = read_submission(context)
        df_lab = read_csv("data/labeled_data.csv", context)
        # --- Retrieving labeled comments ---
        # print("Retrieving labeled comments")
        df_c_lab = retrieve_labeled_comments(df_comm, df_lab, context)
        # --- Clean ---
        # print("Sanitizing labeled")
        df_clean = clean_df(df_c_lab, sanitize, context)
        # --- Vectorize ---
        # print("Vectorizing labeled")
        df_vector, CVmodel, count_v = createCV(df_clean, context)
        # --- Binary Labeling ---
        # print("Setting binary labels")
        df_labeled_training = binary_label(df_vector, context, name="training_labeled")
        # --- Regression Training ---
        # print("Training Regression model")
        tags = ["demP", "demN", "gopP", "gopN", "djtP", "djtN"]
        models = {}
        for t in tags:
            # One binary regression model per sentiment tag.
            models[t] = spark_regression.regression(df_labeled_training, t, t)
        # --- Run on full file ---
        # print("Running on full set")
        df_full = run_full(df_comm, context, sanitize, CVmodel, models)

    # Top stories and map data over time
    for tag in tags:
        analysis.top_stories(df_full, df_sub, context, tag)
        analysis.top_stories(df_full, df_sub, context, tag, 10)
        map_wrap_to_pandas(df_full, context, tag)
    # Scatter, sentiment, map data (positive/negative tag pairs)
    for t in [["demP", "demN"], ["gopP", "gopN"], ["djtP", "djtN"]]:
        map_wrap_to_pandas(df_full, context, t[0], t[1])
        analysis.sentiment_over_time(df_full, context, t[0], t[1])
        analysis.scatter(df_full, df_sub, context, t[0], t[1], 1)
        analysis.scatter(df_full, df_sub, context, t[0], t[1], 100)
    # Total Republican Scatter
    analysis.total_scatter(df_full, df_sub, context)