def getFlair():
    # Fetch the submission for the link passed as a query parameter.
    link = request.args.get('link')
    reddit = praw.Reddit(client_id='', client_secret='', user_agent='',
                         username='', password='')
    submission = reddit.submission(url=link)

    # Flatten the full comment tree into a single string.
    posts = []
    submission.comments.replace_more(limit=None)
    commentList = ''
    for comment in submission.comments:
        commentList += " " + comment.body

    posts.append([
        submission.link_flair_text, submission.title, submission.id,
        submission.url, submission.created, commentList, submission.author
    ])
    data = pandas.DataFrame(posts, columns=[
        'link_flair_text', 'title', 'id', 'url', 'created', 'comments', 'author'
    ])

    # Clean the text fields, build the combined feature and predict the flair.
    data[['title']] = data.apply(lambda x: cleaning.clean(x['title']), axis=1)
    data[['comments']] = data.apply(lambda x: cleaning.clean(x['comments']), axis=1)
    X = data['title'] + data['url'] + data['comments'] + data['id']
    loaded_model = pickle.load(open("log_reg_combined_model.sav", 'rb'))
    predicted_flair = loaded_model.predict(X)
    return str(predicted_flair)
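# getFlair() reads request.args and returns a string, which suggests it is a Flask
# view function. A minimal sketch of how it might be registered and queried; the app
# object, route path and port below are assumptions, not part of the original code.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/predict_flair', view_func=getFlair)

if __name__ == '__main__':
    # e.g. GET http://localhost:5000/predict_flair?link=<reddit_post_url>
    app.run(port=5000)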
def start(maxTerm, keyword, pbarData):
    print("Your search term is: " + keyword)
    DATA_PATH = './input_data/tweets.csv'
    FOLDER = './input_data'
    SAVE_PATH = './outputs/results/results.csv'

    # Remove any old tweets.csv
    try:
        os.remove('./input_data/tweets.csv')
        print("Removed tweets.csv")
    except OSError as e:
        logger.error("tweets.csv has not been created yet. " + str(e))

    # Scrape twitter for tweets and save it in input_data/tweets.csv
    print("Scraping...")
    scrape(maxTerm, keyword)  # fakeuseragent error always happens here; it does not affect function.
    sleep(2)  # Give time for the file to be created

    # Assert there is a new tweets.csv
    assert os.path.exists(DATA_PATH), "tweets.csv does not exist."
    print("Scraping finished. Translation being performed...")
    pbarData.incrementStage()

    # Translate tweet column.
    translate.translate(DATA_PATH, pbarData)
    print("Translation complete. Cleaning in progress...")

    # Clean the data
    cleaning.clean(DATA_PATH)

    # # for BERT model
    # print("Cleaning complete. Converting to tsv...")
    # # Change it into correctly formatted .tsv
    # # In case there is already an old dev.tsv
    # try:
    #     os.remove('./input_data/dev.tsv')
    #     print("Removed dev.tsv")
    # except OSError as e:
    #     pass
    #
    # dataprocessing.change_to_tsv(DATA_PATH, FOLDER)
    # print("Preprocessing complete. Processing...")
    # # Put it into BERT. >> Remember to make BERT save the predictions somewhere
    # predicted = bert_eval.predict(FOLDER)

    # LinearSVC model
    print("Cleaning complete. Predicting...")
    processed = SVCmodel.generate(DATA_PATH)
    predicted = SVCmodel.predict(processed, DATA_PATH, SAVE_PATH)
    print("View results at ./outputs/results/results.csv")
    return predicted
def parse(bio, name, extras=[], translations=[], paragraphs=False, url_context='/'):
    # check we've got a string here
    assert type(bio) == str

    # run the new and improved cleaning
    bio = cleaning.clean(bio, name)

    # remove html special characters
    bio = html.unescape(bio)

    # convert formula to katex
    regex = re.compile(r'\\(?P<math>.+?)\\\\', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, mathreplace, bio)

    # convert special sets (escape the backslash so re.sub emits a literal \mathbb)
    regex = re.compile(r'`(.)`', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, r'<latex>\\mathbb{\1}</latex>', bio)

    # check that the link location is correct
    regex = re.compile(r'<a\s+href\s*=\s*[\'"]?(?P<href>.+?)[\'"]?\s*>(?P<text>.*?)<\/a>')
    bio = re.sub(regex, lambda match: urlreplace(match, url_context), bio)

    # convert m links to w links
    regex = re.compile(r'<w(?:\s+(?P<name>.+?))?>(?P<text>.*?)\<\/w\>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, mreplace, bio)

    # convert <font color=...>...</font> to f+, etc.
    regex = re.compile(r'<font color\s*=\s*[\'"]?(?P<color>\w+)[\'"]?\s*>(?P<text>.*?)</font>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, fontreplace, bio)

    # check that the image location is correct
    regex = re.compile(r'<d\s+(?P<content>.+?)>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, lambda match: dreplace(match, url_context, name), bio)

    # convert to normal diagrams
    regex = re.compile(r'(?P<tag><img\s+.+?>)', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, lambda match: imgreplace(match, url_context), bio)

    # remove manual italics from numbers/brackets
    regex = re.compile(r'<\/i>([\d\[\]\(\)]+)<i>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, r'\1', bio)

    # remove size from glossary entry
    regex = re.compile(r'<g\s+(?P<glossary>.+?)(?:,\d+(?:,\d+)?)?>(?P<text>.*?)\<\/g\>', re.MULTILINE | re.DOTALL)
    bio = re.sub(regex, r'<g \1>\2</g>', bio)

    # convert symbolgifs to unicode
    bio = symbolreplace.symbols_to_unicode(bio)

    # we can do smart quotes here too, actually: normalize curly quotes to straight ones
    bio = bio.replace('’', "'")
    bio = bio.replace('‘', "'")
    bio = bio.replace('“', '"')
    bio = bio.replace('”', '"')

    # remove any print-only tags
    bio = bio.replace('<pr>', '')
    bio = bio.replace('</pr>', '')

    return bio
def __init__(self, labels, root, transform, salient=False, n_negatives=5):
    self.root = root
    self.transforms = transform
    self.df = clean(root, labels)
    self.data = None
    self.n_negatives = n_negatives
    self.salient = salient
def nn_results():
    global results
    if request.method == 'POST':
        file = request.files['file']
        df = pandas.read_csv(file)

        # set months and clean the file
        months = df['month']
        clean_df = clean(df).values

        loaded_model = pickle.load(open("nn_model.pkl", "rb"))
        # use predict_classes not predict
        print("model loaded")
        ynew = loaded_model.predict_classes(clean_df)
        print("prediction step ")

        # call getResults function or module: returns json data for charts
        # create var results or data for getResults()
        results = getResults(ynew, months)
        print("results : ", results)

        # df = pandas.DataFrame(nn_results)
        # dlist = df.values.list()
        # return render_template("examples/exp.html", data=results)
        return jsonify({'results': results})
    else:
        print('GET the problem')
        return jsonify({'results': results})
def analyze_message(value):
    col_names = [
        'Station name', 'Train name', 'Category', 'Platform number',
        'Is spam', 'If delay'
    ]
    output = pd.DataFrame(columns=col_names)
    # print(value)
    a, b, d, t = cleaning.clean(value)
    c, e, f = np.nan, np.nan, np.nan
    tex = pd.Series(value)
    numtext = tokenize.texts_to_matrix(tex)
    ee = fit_model1.predict(numtext)
    index = np.argmax(ee)
    if index == 0:
        e = False
    else:
        e = True
    if t is not np.nan:
        predicted = fit_model2.predict(t)
        index = np.argmax(predicted)
        c = encoder.inverse_transform([index])
        f = cleaning.get_delay_time(c, value)
        # print(encoder.inverse_transform([index]))
    output = output.append(pd.Series([a, b, c, d, e, f], index=col_names),
                           ignore_index=True)
    print(tabulate(output, headers=col_names, tablefmt='psql'))
def accept_generic_model(name=None):
    global results
    if request.method == 'POST':
        file = request.files['file']
        df = pandas.read_csv(file)

        # generic cleaning
        clean_df = clean(df).values

        # Build neural network // parameters ??
        nn = build_nn()

        # use predict_classes not predict
        print("results : ", results)
        return jsonify({'results': results})
    else:
        print('GET the problem')
        return jsonify({'results': results})
def create_files(lines):
    for line in lines:
        text = line[1]
        text = clean(text)
        label = line[0]
        label = re.sub("[^a-zA-Z]", '', label)
        if len(label) > 4:
            continue
        directory_name = 'data/input/' + label + '/'
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
        # use a context manager so the file handle is closed after each write
        with open(directory_name + "train.txt", "a+") as f:
            f.write(text)
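# create_files() expects each item in `lines` to be a (label, text) pair, judging by
# line[0] / line[1] above. A minimal sketch of feeding it from a CSV; the file name
# and column order are assumptions.
import csv

with open('data/labelled_posts.csv', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    next(reader)  # skip a header row, if present
    create_files(list(reader))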
def train():
    # Loads the data from the local storage
    synopses = []
    for filename in os.listdir('cnn-stories'):
        with open('cnn-stories/' + filename, 'r') as infile:
            synopses.append(infile.read())

    # Cleans the data
    corpus, dictionary = clean(synopses)

    # Saves the model and the dictionary in local storage
    corpora.Dictionary.save(dictionary, 'dictionary.dict')
    lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary,
                          update_every=5, chunksize=10000, passes=100)
    lda.save('lda.model')
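# A hedged sketch of reusing the artifacts train() writes out. It assumes clean()
# returns a gensim bag-of-words corpus plus its Dictionary, so a new document can be
# scored with the standard gensim load / doc2bow / get_document_topics calls; the
# sample tokens are made up.
from gensim import corpora, models

dictionary = corpora.Dictionary.load('dictionary.dict')
lda = models.LdaModel.load('lda.model')

tokens = "stocks fell sharply after the earnings report".split()  # assumed pre-cleaned tokens
bow = dictionary.doc2bow(tokens)
print(lda.get_document_topics(bow))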
def normalize(motion, translate='', rotate='', scale='', clean=True):
    """!
    Apply normalization to a motion. The input motion is not modified.

    @param motion numpy.array: The motion to normalize
    @param translate: The normalization for translating the motions
    @param rotate: The normalization for rotating the motions
    @param scale: The normalization for scaling the motions
    @param clean: Remove duplicate points and large jumps, default true
    @return: The normalized motion and the normalization parameters
    """
    out = motion
    translationRef = translation.translate(out[:, 1:4], translate)
    rotationRef = rotation.rotate(out[:, 1:8], rotate)
    scalingRef = scaling.scale(out[:, 1:4], scale)
    out, removedPoints = cleaning.clean(motion)
    return out, translationRef, rotationRef, scalingRef
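# A minimal usage sketch for normalize(), assuming the motion layout implied by the
# slices above: column 0 is a timestamp, columns 1-3 are x/y/z position and columns
# 4-7 are an orientation quaternion. That layout is an assumption, not documented here.
import numpy as np

motion = np.zeros((100, 8))
motion[:, 0] = np.linspace(0.0, 1.0, 100)        # timestamps
motion[:, 1:4] = np.random.rand(100, 3)          # x, y, z positions
motion[:, 4:8] = np.array([1.0, 0.0, 0.0, 0.0])  # identity quaternions

normalized, translationRef, rotationRef, scalingRef = normalize(motion)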
def create_examples(self, lines, set_type):
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        id_num = "%s-%s" % (set_type, i)
        text = line[1]
        text = clean(text)
        label = line[0]
        label = re.sub("[^a-zA-Z]", '', label)
        label = label.lower()
        if len(label) > 4:
            continue
        if self.mode == "E/I" or self.mode == "I/E":
            label = label[0]
        elif self.mode == "N/S" or self.mode == "S/N":
            label = label[1]
        elif self.mode == "T/F" or self.mode == "F/T":
            label = label[2]
        elif self.mode == "J/P" or self.mode == "P/J":
            label = label[3]
        examples.append(InputExample(guid=id_num, text=text, label=label))
    return examples
reddit = praw.Reddit(client_id='', client_secret='', user_agent='',
                     username='', password='')
submission = reddit.submission(url="link_you_want_to_check")

posts = []
submission.comments.replace_more(limit=None)
commentList = ''
for comment in submission.comments:
    commentList += " " + comment.body

posts.append([
    submission.link_flair_text, submission.title, submission.id,
    submission.url, submission.created, commentList, submission.author
])
data = pandas.DataFrame(posts, columns=[
    'link_flair_text', 'title', 'id', 'url', 'created', 'comments', 'author'
])

data[['title']] = data.apply(lambda x: cleaning.clean(x['title']), axis=1)
data[['comments']] = data.apply(lambda x: cleaning.clean(x['comments']), axis=1)
X = data['title'] + data['url'] + data['comments'] + data['id']

loaded_model = pickle.load(open("model.sav", 'rb'))
predicted_flair = loaded_model.predict(X)
print(predicted_flair, submission.link_flair_text)
def prediction(vectorized_data):
    target_column = vectorized_data['Number of Votes']
    predictor_columns = vectorized_data.drop('Number of Votes', 1)
    vector_columns = vectorized_data[vector_headers]
    vectorized_data.reindex(np.random.permutation(vectorized_data.index))

    NUM_ROWS = vectorized_data.shape[0]
    NUM_TEST = int(NUM_ROWS * .15)
    train_data = vectorized_data[NUM_TEST:]
    train_target = train_data['Number of Votes']
    train_data = train_data[vector_headers]
    test_data = vectorized_data[:NUM_TEST]
    test_target = test_data['Number of Votes']
    test_data = test_data[vector_headers]
    # (train_data, test_data, train_target, test_target) = ms.train_test_split(predictor_columns, target_column, test_size=0.15)

    classifier = RandomForestClassifier(n_estimators=10)
    classifier = classifier.fit(train_data[vector_headers], train_target)
    results = classifier.predict(test_data[vector_headers])
    output = pd.DataFrame(data={"Candidate": test_data['Candidate'],
                                "County": test_data['County'],
                                "Estimated Votes": results,
                                "Actual Votes": test_target})
    return output


cleaned_tweets = clean()
vectorized_data = vectorize(cleaned_tweets)
results = prediction(vectorized_data)
print(results)
import pandas as pd
import cleaning as cl
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

df_train = pd.read_csv('datasets/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('datasets/test.csv', encoding="ISO-8859-1")
address = pd.read_csv('datasets/addresses.csv')
latlons = pd.read_csv('datasets/latlons.csv')

X_train, X_test, y_train = cl.clean(df_train, df_test, address, latlons)
X_train.info()
X_train.shape
X_test.shape
y_train.shape

grid_values = {'learning_rate': [0.01, 0.1, 1], 'max_depth': [3, 5]}
clf = GradientBoostingClassifier(random_state=0)
grid_clf_auc = GridSearchCV(clf, param_grid=grid_values, scoring='roc_auc')
grid_clf_auc.fit(X_train, y_train)
grid_clf_auc.best_score_

grid_values2 = {'n_estimators': [5, 10, 20], 'max_features': [2, 3, 4], 'max_depth': [3, 5]}
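# The snippet stops right after grid_values2 is defined. A plausible continuation,
# mirroring the gradient boosting search above; the variable names, scoring metric
# and random_state here are assumptions, not the author's code.
clf2 = RandomForestClassifier(random_state=0)
grid_clf2_auc = GridSearchCV(clf2, param_grid=grid_values2, scoring='roc_auc')
grid_clf2_auc.fit(X_train, y_train)
grid_clf2_auc.best_score_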
modelFileLoad2 = open('models/model_m1', 'rb')
encoder = LabelEncoder()
encoder.classes_ = np.load('labelencoder/encoder_m1.npy')
fit_model1 = pickle.load(modelFileLoad1)
fit_model2 = pickle.load(modelFileLoad2)

col_names = [
    'Message', 'Station name', 'Train name', 'Category', 'Platform number',
    'Is spam', 'If delay'
]
output = pd.DataFrame(columns=col_names)

for idx, value in enumerate(train.m):
    # print(value)
    a, b, d, t = cleaning.clean(value)
    c, e, f = np.nan, np.nan, np.nan
    tex = pd.Series(value)
    numtext = tokenize.texts_to_matrix(tex)
    ee = fit_model1.predict(numtext)
    index = np.argmax(ee)
    if index == 0:
        e = False
    else:
        e = True
    if t is not np.nan:
        predicted = fit_model2.predict(t)
        index = np.argmax(predicted)
        c = encoder.inverse_transform([index])
        f = cleaning.get_delay_time(c, value)
def apply_clean(text):
    cleaned = clean_text(text)
    cleaned = clean(cleaned)
    cleaned = stemming(cleaned)
    cleaned = remove_lemmatization(cleaned)
    return cleaned
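# A quick usage sketch for apply_clean(); the sample sentence is made up, and the
# helpers it chains (clean_text, clean, stemming, remove_lemmatization) are assumed
# to be defined elsewhere in the project.
sample = "The engines were running FASTER than expected!!!"
print(apply_clean(sample))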
def main(Args):
    '''
    Main function for stock price prediction
    :param Args: arguments acquired from the command line (refer to ParseArgs() for the list of args)
    '''
    stockFilePath = Args.stockfilepath
    newsFilePath = Args.newsfilepath
    nCpuCores = Args.ncpucores
    testStockFilePath = Args.teststockfilepath
    testNewsFilePath = Args.testnewsfilepath
    modelTimeSeries = Args.modeltimeseries
    modelNews = Args.modelnews
    Logger.debug(
        "StockDataPath: {}, NewsDataPath: {}, NCpuCores: {}, TestStockFilePath: {}, TestNewsFilePath: {}"
        .format(stockFilePath, newsFilePath, nCpuCores, testStockFilePath, testNewsFilePath))

    # Time Series Analysis
    dataFrame = pd.read_csv(stockFilePath, parse_dates=True, index_col="date")
    testDataFrame = pd.read_csv(testStockFilePath, parse_dates=True, index_col="date")
    colsToInterpolate = ['open', 'high', 'low', 'close', 'adj_close', 'volume']

    # Interpolate
    dataFrame = interpolate(dataFrame, colsToInterpolate)
    testDataFrame = interpolate(testDataFrame, colsToInterpolate)

    # Preprocessing
    dataFrame = preprocessing(dataFrame)
    attributes = dataFrame.drop('close', axis=1)
    target = dataFrame.loc[:, 'close']
    testDataFrame = preprocessing(testDataFrame)
    testAttributes = testDataFrame.drop('close', axis=1)
    testTarget = testDataFrame.loc[:, 'close']

    # Normalization - converting values to comparable range
    attributes['volume'] /= 100000
    testAttributes['volume'] /= 100000

    # Predictions and errors of different algorithms for the time series
    errorTimeSeries, predictionTimeSeries = predictFromTimeSeries(
        attributes, target, testAttributes, testTarget, nCpuCores)

    # News Analysis
    newsDataFrame = pd.read_csv(newsFilePath, parse_dates=True, index_col='date')
    testNewsDataFrame = pd.read_csv(testNewsFilePath, parse_dates=True, index_col='date')

    # Cleaning of textual data
    newsAttributes, newsTarget = clean(newsDataFrame)
    newsTestAttributes, newsTestTarget = clean(testNewsDataFrame)

    # Embeddings
    newsAttributes, newsTestAttributes = embeddings(newsAttributes, newsTestAttributes)

    # Predictions and errors of different algorithms for news
    errorNews, predictionNews = predictFromNews(newsAttributes, newsTarget,
                                                newsTestAttributes, newsTestTarget,
                                                nCpuCores)

    combinePredictions(errorTimeSeries, errorNews, predictionTimeSeries,
                       predictionNews, testTarget)
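# The docstring above points at ParseArgs(), which is not shown in this snippet. A
# hedged sketch of what it might look like, inferred from the attributes main() reads
# off Args; the flag names, defaults and description are assumptions.
import argparse

def ParseArgs():
    parser = argparse.ArgumentParser(description="Stock price prediction")
    parser.add_argument("--stockfilepath", required=True)
    parser.add_argument("--newsfilepath", required=True)
    parser.add_argument("--ncpucores", type=int, default=1)
    parser.add_argument("--teststockfilepath", required=True)
    parser.add_argument("--testnewsfilepath", required=True)
    parser.add_argument("--modeltimeseries", default=None)
    parser.add_argument("--modelnews", default=None)
    return parser.parse_args()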
""" Runs the project. """ # Import local methods. import loading import cleaning import features import training # Load raw data into datasets. loading.load() # Clean data. cleaning.clean() # Create features. features.create_features() # Train neural network. training.train_neural_network() # Train logistic regression. training.train_logistic_regression()