def pipeline(website, filename):
    # open the output file with a context manager so it is always closed
    with open(filename, "w", encoding="utf-8") as file:
        print("Crawling website: ", website)
        web_name = url_cleaning(website)
        website2mcc = read_website2mcc()
        print("url_check: ", url_check(website))
        if url_check(website):
            content = get_content(website)
            if content is not None:
                print("web content is not None")
                content_processed = clean_data(content)
                file.write('%s, %s\n' % (website, content_processed))
            # get url list in the same domain
            urls = get_urls(website)
            print("number of urls: ", len(urls))
            for url in urls:
                if url_check(url):
                    content = get_content(url)
                    if content is not None:
                        content_processed = clean_data(content)
                        file.write('%s, %s\n' % (url, content_processed))
                        print("preprocessed ", url)
def main():
    tweets = tweet_dict(twitterData)  # contains tweets
    sentiment = arg_dict.sentiment_dict(sentimentData)  # contains dictionary of scores
    space = ['']
    for index in range(len(tweets)):
        tweet_word = tweets[index]["text"].split()  # tokenize every word of the tweet
        tweet_word = preprocess.clean_stopwords(tweet_word)  # remove stopwords from the list of words
        sent_score = 0  # the sentiment score starts at 0
        for word in tweet_word:  # access the tweet word by word
            word = word.lower()  # lower-case the word; all words in the sentiment file are lower case
            word = preprocess.clean_data(word)  # remove punctuation and URLs from the word
            if word != "":  # skip words that were emptied by cleaning
                if word in sentiment:  # check whether the word appears in the sentiment file
                    sent_score = sent_score + int(sentiment[word])  # add the word's sentiment score
            if word not in space:
                print(word, int(sent_score))  # print the result to stdout
async def predict(text: TextSample, request: Request):
    text = str(text)
    # pull the field value out of the model's string representation
    text_str = text.split('=')[1].replace("'", "")
    try:
        text_cleaned = preprocess.clean_data(text_str)
        out = model.predict(text_cleaned)
        words = {}
        a1, a2, a3, s, c, ps = [], [], [], [], [], []
        for item in out:
            tag = item['tag'].split('-')
            word = item['word']
            if len(tag) == 2:
                if tag[1] == 'A1':
                    a1.append(word)
                elif tag[1] == 'A2':
                    a2.append(word)
                elif tag[1] == 'A3':
                    a3.append(word)
                elif tag[1] == 'C':
                    c.append(word)
                elif tag[1] == 'S':
                    s.append(word)
                elif tag[1] == 'PC':
                    ps.append(word)
        words['A1'] = " ".join(a1)  # address line 1
        words['A2'] = " ".join(a2)
        words['A3'] = " ".join(a3)
        words['C'] = " ".join(c)
        words['S'] = " ".join(s)
        words['PS'] = " ".join(ps)
        json_compatible_item_data = jsonable_encoder(words)
        return JSONResponse(content=json_compatible_item_data)
    except Exception as e:
        print(e)
        return {"result": "Model Failed"}
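# A minimal sketch of exercising the endpoint above with FastAPI's TestClient.
# The module/app names ("main", "app"), the route path ("/predict"), and the
# TextSample field name ("text") are assumptions for illustration only.
from fastapi.testclient import TestClient

from main import app  # hypothetical module exposing the FastAPI app with the route above

client = TestClient(app)
response = client.post("/predict", json={"text": "12 Example Street Springfield 999077"})
print(response.status_code, response.json())  # expected keys: A1, A2, A3, C, S, PS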
def predict():
    text = request.json["text"]
    print(text)
    print(type(text))
    try:
        text_cleaned = preprocess.clean_data(text)
        out = model.predict(text_cleaned)
        words = {}
        a1, a2, a3, s, c, ps = [], [], [], [], [], []
        for item in out:
            tag = item['tag'].split('-')
            word = item['word']
            if len(tag) == 2:
                if tag[1] == 'A1':
                    a1.append(word)
                elif tag[1] == 'A2':
                    a2.append(word)
                elif tag[1] == 'A3':
                    a3.append(word)
                elif tag[1] == 'C':
                    c.append(word)
                elif tag[1] == 'S':
                    s.append(word)
                elif tag[1] == 'PC':
                    ps.append(word)
        words['A1'] = " ".join(a1)  # address line 1
        words['A2'] = " ".join(a2)
        words['A3'] = " ".join(a3)
        words['C'] = " ".join(c)
        words['S'] = " ".join(s)
        words['PS'] = " ".join(ps)
        print(words)
        print(type(words))
        return words
    except Exception as e:
        print(e)
        return jsonify({"result": "Model Failed"})
def computeSentiment(tweets, sentiments):
    tweet_scores = []
    term_sentiments = {}
    for tweet in tweets:
        tweet_score = 0  # For every tweet start the score at 0
        tweet_words = tweet.split()  # Tokenize the tweet
        tweet_words = preprocess.clean_stopwords(tweet_words)  # Remove all stopwords from the list of words
        for word in tweet_words:  # For every word in the tweet
            word = word.lower()  # Convert it to lower case
            word = preprocess.clean_data(word)  # Preprocess the word
            if word in sentiments:  # If the word is present in the sentiment file
                word_score = sentiments[word]  # Use its sentiment score
                tweet_score += word_score  # Add the score to the corresponding tweet score
            else:
                word_score = 0
                tweet_score += word_score
            # Add the term and its sentiment to the dictionary
            if word not in term_sentiments.keys():
                term_sentiments[word] = word_score
        # Add the tweet and its score to tweet_scores
        tweet_scores.append([tweet, tweet_score])
    # Now, for every term in the dictionary of terms, check if the term has a known sentiment
    for term in term_sentiments:
        if term not in sentiments:
            # Unknown terms have a base score of zero and are assumed to have occurred once
            new_score = 0
            occur = 1
            # Find all tweets that contain the new term
            for i in range(0, len(tweet_scores)):
                if term in tweet_scores[i][0]:
                    new_score += tweet_scores[i][1]
                    occur += 1
            # Normalize the new score by the number of occurrences
            new_score /= occur
            term_sentiments[term] = new_score
            print(term + " " + str(format(term_sentiments[term], '.3f')))
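# A self-contained sketch of the idea computeSentiment implements above: terms
# missing from the sentiment lexicon inherit the (normalized) sum of the scores
# of the tweets that contain them. The tiny lexicon and tweets are made-up data.
def derive_unknown_term_scores(tweets, sentiments):
    tweet_scores = []
    for tweet in tweets:
        score = sum(sentiments.get(w.lower(), 0) for w in tweet.split())
        tweet_scores.append((tweet, score))

    derived = {}
    for tweet, _ in tweet_scores:
        for word in tweet.lower().split():
            if word not in sentiments and word not in derived:
                containing = [s for t, s in tweet_scores if word in t.lower().split()]
                derived[word] = sum(containing) / (len(containing) + 1)  # +1 mirrors the base occurrence above
    return derived


print(derive_unknown_term_scores(
    ["good happy day", "bad gloomy day"],
    {"good": 3, "happy": 2, "bad": -3},
))  # -> {'day': 0.666..., 'gloomy': -1.5}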
def predict(input):
    for text in input:
        text_cleaned = preprocess.clean_data(text['address'])
        print(text_cleaned)
        out = model.predict(text_cleaned)
        print(out)
        words = {}
        a1, a2, a3, s, c, ps = [], [], [], [], [], []
        for item in out:
            tag = item['tag'].split('-')
            word = item['word']
            if len(tag) == 2:
                if tag[1] == 'A1':
                    a1.append(word)
                elif tag[1] == 'A2':
                    a2.append(word)
                elif tag[1] == 'A3':
                    a3.append(word)
                elif tag[1] == 'C':
                    c.append(word)
                elif tag[1] == 'S':
                    s.append(word)
                elif tag[1] == 'PC':
                    ps.append(word)
        words['A1'] = " ".join(a1)  # address line 1
        words['A2'] = " ".join(a2)
        words['A3'] = " ".join(a3)
        words['C'] = " ".join(c)
        words['S'] = " ".join(s)
        words['PS'] = " ".join(ps)
        print(words)
    return words
def process_data(training_data_path=TRAINING_DATA_PATH, data_frame=None):
    """Load the training data from the provided path, or take the given data frame, and add all features."""
    if data_frame is None:
        data_frame = load_training_data(training_data_path)
    load_config()
    data_frame = clean_data(data_frame)
    data_frame = load_features_async(data_frame)
    for text_feature in ["description", "readme"]:
        if text_feature in data_frame.columns:
            data_frame[text_feature].fillna("", inplace=True)
    data_frame.fillna(0, inplace=True)
    return data_frame
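# A hedged usage sketch for process_data: either let it load the CSV behind
# TRAINING_DATA_PATH, or hand it an already-loaded frame. "repos.csv" is a
# made-up file name used purely for illustration.
import pandas as pd

featurized_default = process_data()                                    # loads TRAINING_DATA_PATH
featurized_custom = process_data(data_frame=pd.read_csv("repos.csv"))  # skips the loading step
print(featurized_custom.head())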
def main():  # main begins
    tweets = tweet_dict(twitterData)  # contains tweets
    sentiment = sentiment_dict(sentimentData)  # contains dictionary of scores
    for index in range(len(tweets)):  # iterate over every tweet
        tweet_word = tweets[index]['text'].split()  # tokenize every word of the tweet
        tweet_word = preprocess.clean_stopwords(tweet_word)  # remove stopwords from the list of words
        sent_score = 0  # the sentiment score starts at 0
        for word in tweet_word:  # access the tweet word by word
            word = word.lower()  # lower-case the word; all words in the sentiment file are lower case
            word = preprocess.clean_data(word)  # remove punctuation and URLs from the word
            if word != "":  # skip words that were emptied by cleaning
                if word in sentiment:  # check whether the word appears in the sentiment file
                    sent_score = sent_score + int(sentiment[word])  # add the word's sentiment score
            print("word:", word, "sentiment_score", int(sent_score))  # print the result to stdout
    plst = list(params.items())
    return plst


def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    # data has the format of pred=0, offset_pred=1, labels=2 in the first dim
    data[1, data[0].astype(int) == sv] = data[0, data[0].astype(int) == sv] + bin_offset
    score = scorer(data[1], data[2])
    return score


# global variables
xgb_num_rounds = 500
num_classes = 8

# preprocess data
M = clean_data()
train, test = M.data_split()
columns_to_drop = M.columns_to_drop

# convert data to the xgb data structure
xgtrain = xgb.DMatrix(train.drop(columns_to_drop, axis=1), train['Response'].values)
xgtest = xgb.DMatrix(test.drop(columns_to_drop, axis=1), label=test['Response'].values)

# get the parameters for xgboost
plst = get_params()

# train the model
model = xgb.train(plst, xgtrain, xgb_num_rounds)

# get predictions on the training set
train_preds = model.predict(xgtrain, ntree_limit=model.best_iteration)
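# A small, self-contained illustration of the indexing trick inside apply_offset:
# row 0 holds raw predictions, row 1 the offset predictions, row 2 the labels,
# and only entries whose truncated prediction equals sv get shifted by bin_offset.
# The toy array below is illustration data, not output of the pipeline above.
import numpy as np

toy = np.array([
    [1.2, 3.7, 1.9, 5.1],   # pred
    [1.2, 3.7, 1.9, 5.1],   # offset_pred (starts as a copy)
    [2.0, 4.0, 2.0, 5.0],   # labels
])
sv, bin_offset = 1, 0.5
toy[1, toy[0].astype(int) == sv] = toy[0, toy[0].astype(int) == sv] + bin_offset
print(toy[1])  # -> [1.7 3.7 2.4 5.1]; entries with int(pred) == 1 were shifted by 0.5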
params["eta"] = 0.05 params["min_child_weight"] = 60 params["subsample"] = 0.7 params["colsample_bytree"] = 0.5 params["silent"] = 1 params["max_depth"] = 9 plst = list(params.items()) return plst # global variables xgb_num_rounds = 300 # preprocess data M = clean_data() train, test = M.data_split() columns_to_drop = M.columns_to_drop # get the parameters for xgboost plst = get_params() skf = StratifiedKFold(train['Response'].values, n_folds=3, random_state=1234) scores = [] for train_index, test_index in skf: X_train, X_test = train.iloc[train_index], train.iloc[test_index] xgtrain = xgb.DMatrix(X_train.drop(columns_to_drop, axis=1), X_train['Response'].values) xgtest = xgb.DMatrix(X_test.drop(columns_to_drop, axis=1), X_test['Response'].values)
if __name__ == "__main__":
    stat = None
    print("reading statistic.json")
    with open("statistic.json", "r") as f:
        stat = json.load(f)

    print("reading data.csv")
    df = pd.read_csv("data.csv")
    df = df[:19]
    df = df.astype(np.float64)
    df = clean_data(df)

    print("testing dataframe.....")
    test_data_na(df)

    drop_1 = []
    drop_2 = []
    drop_3 = []
    cols = []

    # start of the 3-set columns
    # df.columns[17]
    for i, col in enumerate(df.columns[17:]):
import preprocess
import numpy as np
import math
import matplotlib.pyplot as plt

num_iterations = 100
num_features = 100
alpha = 0.01
llambda = 1.5

df, probe_df = preprocess.clean_data()
# train_df, test_df = preprocess.split_train_test(df)

# mappings between raw user/movie ids and contiguous matrix indices
train_dict_user_id_to_index = {int(user_id): index for index, user_id in enumerate(df["user_id"].unique())}
train_dict_index_to_user_id = {index: int(user_id) for index, user_id in enumerate(df["user_id"].unique())}
train_dict_movie_id_to_index = {int(movie_id): index for index, movie_id in enumerate(df["movie_id"].unique())}
train_dict_index_to_movie_id = {index: int(movie_id) for index, movie_id in enumerate(df["movie_id"].unique())}

train_num_movies = len(df["movie_id"].unique())
train_num_users = len(df["user_id"].unique())
print("The number of movies in train data: ", train_num_movies)
print("The number of users in train data: ", train_num_users)

# test_num_movies = len(df["movie_id"].unique())
# test_num_users = len(df["user_id"].unique())
# print("The number of movies in test data: ", test_num_movies)
# print("The number of users in test data: ", test_num_users)

train_data = df.values
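# A hedged sketch of what the id/index dictionaries above are typically used for:
# turning (user_id, movie_id, rating) rows into a dense user x movie matrix for
# the matrix-factorization step. The column order user_id, movie_id, rating in
# train_data is an assumption; the real output of preprocess.clean_data() may differ.
ratings_matrix = np.zeros((train_num_users, train_num_movies))
for user_id, movie_id, rating in train_data[:, :3]:
    u = train_dict_user_id_to_index[int(user_id)]
    m = train_dict_movie_id_to_index[int(movie_id)]
    ratings_matrix[u, m] = rating
print("Rating matrix shape: ", ratings_matrix.shape)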