def actLearning():
    # Unlabeled data
    #
    # Get the unlabeled data from csv
    unlabeledData = pd.read_csv(settings.treatedLinksPath)
    unlabeledData = unlabeledData[unlabeledData['y'].isnull()].dropna(how='all')

    # Clean the data
    cleanedData = functions.clean(unlabeledData)

    # Create features
    features = functions.createFeatures(cleanedData)

    # Apply the text vectorizer fitted during training
    textVector = variables.mlData['tFidVec'].transform(cleanedData['title'])

    # Include the extracted text features
    features = hstack([features, textVector])

    # Using the first model, predict probabilities for the unlabeled data and store them
    probRFUnlabeled = variables.mlData['modelRF'].predict_proba(features)[:, 1]
    unlabeledData['p'] = probRFUnlabeled

    # Find the most difficult predictions, add some random examples and write them to csv for labeling
    maskUnlabeled = settings.getActiveLearningFilters(unlabeledData)
    hardExamples = unlabeledData[maskUnlabeled]
    randomExamples = unlabeledData[~maskUnlabeled].sample(30)
    pd.concat([hardExamples, randomExamples]).to_csv(settings.actLearningExamplesPath)
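# The function above relies on settings.getActiveLearningFilters, which is not shown here.
# A minimal sketch of what such a filter could look like, assuming the "hard" examples are
# the rows whose predicted probability sits near the decision boundary (the threshold
# values are an assumption, not the author's):
def getActiveLearningFilters(df, lower=0.35, upper=0.65):
    """Return a boolean mask selecting rows with uncertain predicted probabilities."""
    return (df['p'] >= lower) & (df['p'] <= upper)

# Usage, mirroring the call above:
# maskUnlabeled = getActiveLearningFilters(unlabeledData)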
def clean():
    # Read Treated Data from CSV
    treatedData = pd.read_csv(settings.treatedLinksPath)
    treatedData = treatedData[treatedData['y'].notnull()]

    # Clean the data
    cleanedData = functions.clean(treatedData)

    # Create features
    features = functions.createFeatures(cleanedData)

    # Create the y series
    y = treatedData['y'].copy()

    # Train and test segmentation
    maskTrainTest = settings.getMaskTrainTest(cleanedData)
    xTrain, xTest = features[maskTrainTest['maskTrain']], features[maskTrainTest['maskTest']]
    yTrain, yTest = y[maskTrainTest['maskTrain']], y[maskTrainTest['maskTest']]

    # Extract data from text
    variables.mlData['tFidVec'], titleBOWTrain, titleBOWTest = functions.dataFromText(
        cleanedData['title'], maskTrainTest['maskTrain'],
        maskTrainTest['maskTest'], settings.tfidfParameters)

    # Include extracted text into training and testing
    variables.mlData['xTrain'] = functions.mergeDataFrames(xTrain, titleBOWTrain)
    variables.mlData['xTest'] = functions.mergeDataFrames(xTest, titleBOWTest)
    variables.mlData['yTrain'] = yTrain
    variables.mlData['yTest'] = yTest
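# functions.dataFromText is called above but not shown. A plausible sketch, assuming it fits
# a scikit-learn TfidfVectorizer on the training titles only and then transforms both splits
# (the function name, parameter handling and return order are inferred from the call site):
from sklearn.feature_extraction.text import TfidfVectorizer

def dataFromText(textSeries, maskTrain, maskTest, tfidfParameters):
    """Fit TF-IDF on the training split and transform both train and test titles."""
    vectorizer = TfidfVectorizer(**tfidfParameters)
    bowTrain = vectorizer.fit_transform(textSeries[maskTrain])
    bowTest = vectorizer.transform(textSeries[maskTest])
    return vectorizer, bowTrain, bowTest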
def actLearningTest():
    # Read Treated Data from CSV
    treatedData = pd.read_csv(settings.treatedLinksPath)
    treatedData = treatedData[treatedData['y'].notnull()]

    # Get the new active learning labeled data
    actLearnData = pd.read_csv(settings.actLearningExamplesPath)
    actLearnData = actLearnData[actLearnData['y'].notnull()]
    actLearnData['new'] = 1

    # Concatenate the 2 datasets
    data = pd.concat([treatedData, actLearnData.drop('p', axis=1)])

    # Treat the concatenated data
    data = functions.treat(data)

    # Clean and create features
    cleanedData = functions.clean(data)
    cleanedData['new'] = data['new'].fillna(0)
    features = functions.createFeatures(cleanedData)
    y = data['y'].copy()

    # Get train and test masks
    maskTrainTest = settings.getMaskTrainTest(cleanedData)

    # Test: increase both datasets and run the first model
    xTrain, xTest = features[maskTrainTest['maskTrain']], features[maskTrainTest['maskTest']]
    yTrain, yTest = y[maskTrainTest['maskTrain']], y[maskTrainTest['maskTest']]

    # Extract data from text
    variables.mlData['tFidVec'], titleBOWTrain, titleBOWTest = functions.dataFromText(
        cleanedData['title'], maskTrainTest['maskTrain'],
        maskTrainTest['maskTest'], settings.tfidfParameters)

    # Include extracted text into training and testing
    variables.mlData['xTrain'] = functions.mergeDataFrames(xTrain, titleBOWTrain)
    variables.mlData['xTest'] = functions.mergeDataFrames(xTest, titleBOWTest)
    variables.mlData['yTrain'] = yTrain
    variables.mlData['yTest'] = yTest

    # Use the model and look at the scores
    variables.mlData['modelRF'], variables.mlData['probRF'], variables.mlData[
        'apsRF'], variables.mlData['roc_aucRF'] = models.randomForestWMetrics(
            variables.mlData['xTrain'], variables.mlData['yTrain'],
            variables.mlData['xTest'], variables.mlData['yTest'])

    # store the cleaned data and features
    variables.mlData['cleanedData'] = cleanedData
    variables.mlData['features'] = features
    variables.mlData['y'] = y
def save_product():
    pdata = request.get_json()
    print(pdata)
    product = ProductDB(
        name=pdata['name'],
        price=clean(pdata['price']),
        order_cost=clean(pdata['order_cost']),
        initial_inventory=clean(pdata['initial_inventory']),
        demand_dist=pdata['demand_dist'],
        demand_p1=clean(pdata['demand_p1']),
        demand_p2=clean(pdata['demand_p2']),
        demand_p3=clean(pdata['demand_p3']),
        leadtime_dist=pdata['leadtime_dist'],
        leadtime_p1=clean(pdata['leadtime_p1']),
        leadtime_p2=clean(pdata['leadtime_p2']),
        leadtime_p3=clean(pdata['leadtime_p3']),
    )
    db.session.add(product)
    db.session.commit()
    result = {}
    return json.dumps(result), 200
def main():
    """
    Initiates a Kafka consumer and resets the tweet index.
    The consumer reads tweets from the producer, extracts features,
    cleans the tweet text, calculates sentiment and loads the data
    into the Elasticsearch index.
    """
    with open("hashtag.txt") as f:
        hashtag = f.read()

    # Set up a Kafka consumer
    consumer = KafkaConsumer("twitter_stream_" + hashtag,
                             auto_offset_reset="earliest")

    # Drop any previous index before re-indexing
    os.system("curl -XDELETE localhost:9200/main_index")

    for msg in consumer:
        dict_data = json.loads(msg.value)
        tweet = fn.get_tweet(dict_data["text"])
        polarity, tweet_sentiment = fn.get_sentiment(tweet)
        lang = fn.detect_lang(tweet)

        # Add text & sentiment to Elasticsearch
        es.index(
            index="main_index",
            doc_type="test_doc",
            body={
                "author": dict_data["user"]["screen_name"],
                "author_followers": dict_data["user"]["followers_count"],
                "author_statues": dict_data["user"]["statuses_count"],
                "author_verified": dict_data["user"]["verified"],
                "author_account_age": fn.get_age(dict_data["user"]["created_at"]),
                "created_at": dict_data["created_at"],
                "@timestamp": fn.get_date(dict_data["created_at"], to_string=False),
                "message": dict_data["text"],
                "cleaned_text": fn.clean(dict_data["text"]),
                "sentiment_function": tweet_sentiment,
                "polarity": polarity,
                "lang": lang,
                "source": fn.find_device(dict_data["source"]),
            },
        )
        print(str(tweet))
        print("\n")
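# fn.get_sentiment is used above without its definition. A minimal sketch of one way to
# implement it with TextBlob (this is an assumption; the project may use a different
# library or different thresholds):
from textblob import TextBlob

def get_sentiment(tweet):
    """Return (polarity, label) for a cleaned tweet based on TextBlob polarity."""
    polarity = TextBlob(tweet).sentiment.polarity
    if polarity > 0:
        label = "positive"
    elif polarity < 0:
        label = "negative"
    else:
        label = "neutral"
    return polarity, label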
def convert(first_name, insertion, last_name, zip_code, streetnumber, email):
    """Convert user input to clean strings"""
    return {
        'first_name': functions.clean(first_name),
        'insertion': functions.clean(insertion, False, False, True),
        'last_name': functions.clean(last_name),
        'zip_code': functions.clean(zip_code, False, uppercase=True),
        'streetnumber': functions.clean(streetnumber, False, uppercase=True),
        'email': functions.clean(email, False, lowercase=True)
    }
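# functions.clean is called above with positional and keyword flags. Judging from the call
# sites, a compatible helper might look like the sketch below; the parameter names and
# default behaviour are assumptions inferred from the calls, not the original code:
def clean(value, capitalize=True, uppercase=False, lowercase=False):
    """Strip surrounding whitespace and normalise the casing of a user-supplied string."""
    value = str(value).strip()
    if uppercase:
        return value.upper()
    if lowercase:
        return value.lower()
    if capitalize:
        return value.capitalize()
    return value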
def get_features(input_list):
    feature_tokens = input_list
    merged_input_list = merge(input_list)

    cleaned_input_list1 = []
    for i in merged_input_list:
        cleaned_input_list1.append(clean(i))
    cleaned_input_list = remove_stopwords(cleaned_input_list1)
    merged_cleaned_input_list = merge(cleaned_input_list)

    feature_no_sw, feature_lemma, feature_stem, feature_pos = extract_features1(input_list)
    feature_dp = extract_features2(merged_cleaned_input_list)
    feature_hypernyms, feature_hyponyms, feature_holonyms, feature_meronyms = get_fetures_from_wordNet(
        cleaned_input_list)

    query_dictionary = []
    query = []
    for i in range(len(input_list)):
        query = "tokens: " + "||".join(feature_tokens[i])
        query = query + " no_sw: " + "||".join(feature_no_sw[i])
        if len(feature_lemma[i]) > 0:
            query = query + " lemmas: " + "||".join(feature_lemma[i])
        if len(feature_stem[i]) > 0:
            query = query + " stem_words: " + "||".join(feature_stem[i])
        if len(feature_pos[i]) > 0:
            query = query + " pos_tags: " + "||".join(feature_pos[i])
        if len(feature_dp[i]) > 0:
            query = query + " head_words: " + "||".join(feature_dp[i])
            # query = query + " head_word:" + head_word
        if len(feature_hypernyms[i]) > 0:
            query = query + " hypernyms: " + "||".join(feature_hypernyms[i])
        if len(feature_hyponyms[i]) > 0:
            query = query + " hyponyms: " + "||".join(feature_hyponyms[i])
        if len(feature_holonyms[i]) > 0:
            query = query + " holonyms: " + "||".join(feature_holonyms[i])
        if len(feature_meronyms[i]) > 0:
            query = query + " meronyms: " + "||".join(feature_meronyms[i])
        # query_dictionary.append(get_dictionary(qa_bag[i], feature_set, i))

    print('\n\n')
    print(query)
    print()
    return query
def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        if rdd.count() == 0:
            raise Exception("Empty")
        sqlContext = getSqlContextInstance(rdd.context)
        df = sqlContext.read.json(rdd, multiLine=True)
        if df.count() == 0:
            raise Exception("Empty")

        udf_func = udf(lambda x: dosentiment(x), returnType=StringType())
        # print(df.head(5))
        df = df.withColumn("sentiment", lit(udf_func(df.text)))
        # print(df.take(10))

        results = df.toJSON().map(lambda j: json.loads(j)).collect()
        # print("Sentiment done")
        for result in results:
            result["created_at"] = fn.get_date(result["created_at"])
            result["@timestamp"] = fn.get_date(result["created_at"], to_string=False)
            result["cleaned_text"] = fn.clean(result["text"])
            result["sentiment"] = json.loads(result["sentiment"])
            polarity, tweet_sentiment = fn.get_sentiment(fn.get_tweet(result["text"]))
            result["sentiment_function"] = tweet_sentiment
            result["polarity"] = polarity
            result["source"] = fn.find_device(result["source"])
            result["user_age"] = fn.get_age(result["user"]["created_at"])
            result["nb_characters"] = len(result["text"])
            for topic in top_topics:
                if topic in result["text"]:
                    result["topic"] = topic
            if hashtag in result["text"]:
                result["topic"] = hashtag
        # print("sentiment loaded")
        to_elastic(results, "main_index", "doc")
        # print("Send to elastic done")
    except Exception as e:
        print(e)
        pass
option = st.sidebar.text_input('Enter position title: ', value=session_state.a, key=9)
dtotal = pd.read_csv('cleaned_df.csv')
submit = st.sidebar.button('Search', key=1)
if submit:
    session_state.a = option
try:
    dtotal = dtotal[dtotal['title'].astype(str).str.contains(option)]
except:
    pass
total_length = len(dtotal)

dtotal2 = dtotal['description']
dtotal1 = functions.clean(dtotal2)
dtotal1 = functions.word_count(dtotal1)

c_let3 = functions.cleanC(dtotal2)
c_p3 = functions.C_plus(c_let3)
c_s3 = functions.C_sharp(c_let3)
test3a = Counter(c_p3) + Counter(c_s3)

ctotal = Counter(dtotal1) + Counter(test3a)
total = sum(ctotal.values())
Ctotaldict = [(i, ctotal[i] / total * 100.0) for i in ctotal]

total_result = pd.DataFrame(Ctotaldict, columns=['Tech', 'Percentage'])
total_resulty = pd.DataFrame(Ctotaldict, columns=['Tech', 'Percentage'])
total_resulty = total_resulty.set_index('Tech', drop=True)
total_result_chart = total_result.sort_values('Percentage',
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

data = pd.read_csv('total_data_date.csv')
dtitle = data[data['title'].astype(str).str.contains(option)]
search_terms = terms.total_terms
text = dtitle.description
text_clean = functions.clean(text)


def tech_count(text):
    tech_skills = []
    List1 = [x.lower() for x in search_terms]
    List2 = [x.lower() for x in text_clean]
    for item in List2:
        if item in List1:
            tech_skills.append(item)
    return tech_skills


tech_list = tech_count(text_clean)
from functions import clean, read_transparent_png

ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
                help="Path to the image to be scanned")
args = vars(ap.parse_args())

model = load_model("model.h5")

image = cv2.imread(args["image"], cv2.IMREAD_UNCHANGED)
if image.shape[2] == 4:
    image = read_transparent_png(args["image"])
image = clean(image)
cv2.imshow('gray', image)
cv2.waitKey(0)


def predict(img):
    image_data = img
    dataset = np.asarray(image_data)
    dataset = dataset.reshape((-1, 32, 32, 1)).astype(np.float32)
    print(dataset.shape)
    a = model.predict(dataset)[0]
    classes = np.genfromtxt('classes.csv', delimiter=',')[:, 1].astype(int)
    print(classes)
    new = dict(zip(classes, a))
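# clean() here comes from the project's functions module and is not shown. Since the model
# above expects (32, 32, 1) float input, a rough sketch of a compatible preprocessing step
# could look like the following (the exact steps and helper name are assumptions):
import cv2
import numpy as np

def clean_image(image):
    """Convert to grayscale, resize to 32x32 and scale pixel values to [0, 1]."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (32, 32))
    return resized.astype(np.float32) / 255.0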
for x in range(len(link_list)):
    # for each link in link_list, check if the downloaded and cleaned file
    # already exists (maybe from a previous interrupted run) and if not, continue
    # by downloading and cleaning all chapters from the links
    file_name = (info['chapter_file_names'].replace(' ', '-') + '-' +
                 str(name_counter))
    file_name = f.delete_forbidden_c(f.forbidden_filenames, file_name)
    # spaces and forbidden characters aren't allowed in links, and the chapter
    # name will be the href link in the content.opf part of the epub file
    if not os.path.exists('clean-' + file_name + str(name_counter) + ".xhtml"):
        f.download(link_list[x], 'raw-' + file_name + '.html')
        # download all files from link_list
        chapter_title = f.clean('raw-' + file_name + '.html',
                                'clean-' + file_name + '.xhtml', parser, info,
                                imgs)
        # clean all downloaded files
        print(f'Chapter {str(x+1)}/{str(len(link_list))} ("{chapter_title}")'
              ' processed...')
    name_counter += 1

# due to f.clean() making multiple xhtml files if there are imgs, can't
# include the append in the loop as it's based on the link_list length
files = os.listdir()  # make list of all files and paths in working folder
cleaned_html_files = [
    i for i in files if i.startswith('clean') and i.endswith('.xhtml')
]
# the cleaned_html_files list will be ordered lexicographically, this
tokens = tokenize_corpus(fileName)
questions_list, answers_list = create_questions_and_answers(tokens)

qa_bag = []
for i in range(0, 50):
    questions_list[i] = list(filter((0).__ne__, questions_list[i]))
    answers_list[i] = list(filter((0).__ne__, answers_list[i]))
    qa_bag.append(questions_list[i] + answers_list[i])
print()

merged_qa_bag = merge(qa_bag)

cleaned_bag1 = []
for i in merged_qa_bag:
    cleaned_bag1.append(clean(i))
cleaned_bag = remove_stopwords(cleaned_bag1)
cleaned_merged_bag = merge(cleaned_bag)

# print("Hello1")
# print(merged_qa_bag[0])
# print("Hello")
# print(cleaned_merged_bag[0])
# print(qa_bag[0])
# print("Bag")
# print(cleaned_merged_bag)

feature_tokens = qa_bag
from functions.regression.ridge import ridge_regression
from functions.transform import transform


def print_stage(msg):
    print("\n")
    print("#" * (len(msg) + 4))
    print("# " + msg + " #")
    print("#" * (len(msg) + 4))


# Reference data / Training Data
print_stage("Cleaning & transforming dataset_with_sale.csv (train.csv)")
ref = pd.read_csv('data/dataset_with_sale.csv')
discover_inconsistencies(ref)
ref = clean(ref)
ref = transform(ref)

# Data to Predict
print_stage("Cleaning & transforming dataset_without_sale.csv (test.csv)")
target = pd.read_csv('data/dataset_without_sale.csv')
discover_inconsistencies(target)
target = clean(target)
target = transform(target)

# Numberizing Data
# Adding Number data
int_ref = prep_regression_data(ref.copy())
int_target = prep_regression_data(target.copy())

# Checking if all dummies are included in dataset and filling in missing dummies
int_ref_length = len(int_ref.columns)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

import logging

# This allows for seeing if the model converges. A log file is created.
# logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

# -------------Start Reading data from csv files--------------------------------
ClassRooms = pandas.read_csv('Classrooms.csv')
ClassRooms.rename(columns={'id': 'classroom_id'}, inplace=True)
ClassRooms.rename(columns={'name': 'classroom_name'}, inplace=True)
cleaned_class_name = [
    fun.clean(name) for name in ClassRooms.classroom_name
    if pandas.notnull(name)
]

Activities = pandas.read_csv('Activities.csv')
Activities.rename(columns={'id': 'activity_id'}, inplace=True)
Activities.rename(columns={'name': 'activity_name'}, inplace=True)
cleaned_act_name = [
    fun.clean(name) for name in Activities.activity_name
    if pandas.notnull(name)
]

Subjects = pandas.read_csv('Subjects.csv')
Subjects.rename(columns={'name': 'subject_name'}, inplace=True)
cleaned_sub_name = [
    fun.clean(name) for name in Subjects.subject_name if pandas.notnull(name)
]
import time
import functions as f
import RPi.GPIO as rp

i = f.get_channels()

# try:
#     f.clean()
# except RuntimeWarning:
#     print('NO SETUP!')
#     pass
# else:
#     channels = [29, 31]

f.init_out(i)

# c = 0
# while c < 20:
#     rp.output(channels[0], rp.HIGH)
#     rp.output(channels[1], rp.HIGH)
#     time.sleep(0.1)
#     rp.output(channels[0], rp.LOW)
#     rp.output(channels[1], rp.LOW)
#     time.sleep(0.1)
#     c += 1

# for a in i:
c = 0
while c < 20:
    rp.output(i, rp.HIGH)
    time.sleep(0.1)
    rp.output(i, rp.LOW)
    time.sleep(0.1)
    c += 1

f.clean()
find_all_url = re.findall('data-vr-contentbox-url="(.*)">', html)

# scrape each article
text = ''
name = ''
name_and_text = "Subject: DailyAutoNews-rbc.ru\n"

# total num of articles is 15, so I limited the total number to 5
for i in range(len(find_all_url) - 10):
    url = find_all_url[i]
    page = urlopen(url)
    html = page.read().decode("utf-8")

    # find title
    name = re.findall('<title>(.*)</title>', html)
    name_and_text += '\n\n'
    for j in name:
        name_and_text += j + '\n\n'

    # find all the text between <p>
    text = re.findall('<p>(.*)</p>', html)
    for t in text:
        name_and_text += t

name_and_text = functions.cleanhtml(name_and_text)
name_and_text = functions.clean(name_and_text)
'''
debug
html_to_file = open("html.txt","w")
html_to_file.write(name_and_text)
html_to_file.close()
'''
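# functions.cleanhtml and functions.clean are used above without their definitions. A common,
# minimal way to strip leftover markup with a regex (an assumption about the helper, not the
# original implementation):
import re

def cleanhtml(raw_html):
    """Strip any HTML tags that remain inside the collected article text."""
    return re.sub(r'<[^>]+>', '', raw_html)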