Example #1
def actLearning():

    # Unlabeled data #

    # Get the unlabeled data from csv
    unlabeledData = pd.read_csv(settings.treatedLinksPath)
    unlabeledData = unlabeledData[unlabeledData['y'].isnull()].dropna(
        how='all')

    # Clean the data
    cleanedData = functions.clean(unlabeledData)

    # Create features
    features = functions.createFeatures(cleanedData)

    # Apply the fitted text vectorizer (TF-IDF) to the titles
    textVector = variables.mlData['tFidVec'].transform(cleanedData['title'])

    # Include extracted text into features
    features = hstack([features, textVector])

    # Using the first model, predict probabilities for the unlabeled data and store them in the dataframe
    probRFUnlabeled = variables.mlData['modelRF'].predict_proba(features)[:, 1]
    unlabeledData['p'] = probRFUnlabeled

    # Find the most difficult predictions, add some random examples, and write them to a csv for labelling
    maskUnlabeled = settings.getActiveLearningFilters(unlabeledData)
    hardExamples = unlabeledData[maskUnlabeled]
    randomExamples = unlabeledData[~maskUnlabeled].sample(30)
    pd.concat([hardExamples,
               randomExamples]).to_csv(settings.actLearningExamplesPath)
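
# Note: settings.getActiveLearningFilters is not shown in this example. A
# minimal sketch, assuming it flags rows whose predicted probability 'p' sits
# near the decision boundary (the usual uncertainty-sampling rule) -- an
# illustration, not the project's actual code:
def getActiveLearningFilters(df, low=0.35, high=0.65):
    # Rows the first model is least sure about are the most valuable to label
    return (df['p'] > low) & (df['p'] < high)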
Example #2
def clean():
    # Read Treated Data from CSV
    treatedData = pd.read_csv(settings.treatedLinksPath)
    treatedData = treatedData[treatedData['y'].notnull()]

    # Clean the data
    cleanedData = functions.clean(treatedData)

    # Create features
    features = functions.createFeatures(cleanedData)

    # Create the y series
    y = treatedData['y'].copy()

    # Train/test split
    maskTrainTest = settings.getMaskTrainTest(cleanedData)
    xTrain, xTest = features[maskTrainTest['maskTrain']], features[
        maskTrainTest['maskTest']]
    yTrain, yTest = y[maskTrainTest['maskTrain']], y[maskTrainTest['maskTest']]

    # Extract data from text
    variables.mlData[
        'tFidVec'], titleBOWTrain, titleBOWTest = functions.dataFromText(
            cleanedData['title'], maskTrainTest['maskTrain'],
            maskTrainTest['maskTest'], settings.tfidfParameters)

    # Include extracted text into training and testing
    variables.mlData['xTrain'] = functions.mergeDataFrames(
        xTrain, titleBOWTrain)
    variables.mlData['xTest'] = functions.mergeDataFrames(xTest, titleBOWTest)
    variables.mlData['yTrain'] = yTrain
    variables.mlData['yTest'] = yTest
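
# Note: functions.dataFromText is not shown here. Judging from the call above,
# it probably fits a TF-IDF vectorizer on the training titles only and reuses
# it on the test split. A hedged sketch (an assumption, not the real helper):
from sklearn.feature_extraction.text import TfidfVectorizer

def dataFromText(textSeries, maskTrain, maskTest, tfidfParameters):
    vectorizer = TfidfVectorizer(**tfidfParameters)
    bowTrain = vectorizer.fit_transform(textSeries[maskTrain])  # fit on train only
    bowTest = vectorizer.transform(textSeries[maskTest])        # same vocabulary for test
    return vectorizer, bowTrain, bowTest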
Example #3
def actLearningTest():

    # Read Treated Data from CSV
    treatedData = pd.read_csv(settings.treatedLinksPath)
    treatedData = treatedData[treatedData['y'].notnull()]

    # Get the new active learning labeled data
    actLearnData = pd.read_csv(settings.actLearningExamplesPath)
    actLearnData = actLearnData[actLearnData['y'].notnull()]
    actLearnData['new'] = 1

    # Concatenate the 2 datasets
    data = pd.concat([treatedData, actLearnData.drop('p', axis=1)])

    # Treat the concatenated data
    data = functions.treat(data)

    # Clean and create features
    cleanedData = functions.clean(data)
    cleanedData['new'] = data['new'].fillna(0)
    features = functions.createFeatures(cleanedData)
    y = data['y'].copy()

    # Get train and test masks
    maskTrainTest = settings.getMaskTrainTest(cleanedData)

    # Test: grow both train and test sets with the newly labelled examples and re-run the first model
    xTrain, xTest = features[maskTrainTest['maskTrain']], features[
        maskTrainTest['maskTest']]
    yTrain, yTest = y[maskTrainTest['maskTrain']], y[maskTrainTest['maskTest']]

    # Extract data from text
    variables.mlData[
        'tFidVec'], titleBOWTrain, titleBOWTest = functions.dataFromText(
            cleanedData['title'], maskTrainTest['maskTrain'],
            maskTrainTest['maskTest'], settings.tfidfParameters)

    # Include extracted text into training and testing
    variables.mlData['xTrain'] = functions.mergeDataFrames(
        xTrain, titleBOWTrain)
    variables.mlData['xTest'] = functions.mergeDataFrames(xTest, titleBOWTest)
    variables.mlData['yTrain'] = yTrain
    variables.mlData['yTest'] = yTest

    # Use the model and look at the scores
    variables.mlData['modelRF'], variables.mlData['probRF'], variables.mlData[
        'apsRF'], variables.mlData['roc_aucRF'] = models.randomForestWMetrics(
            variables.mlData['xTrain'], variables.mlData['yTrain'],
            variables.mlData['xTest'], variables.mlData['yTest'])

    # store the cleaned data and features
    variables.mlData['cleanedData'] = cleanedData
    variables.mlData['features'] = features
    variables.mlData['y'] = y
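
# Note: models.randomForestWMetrics is not shown. A minimal sketch of a helper
# with the same return shape (model, test probabilities, average precision,
# ROC AUC), assuming scikit-learn -- not necessarily the author's version:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score

def randomForestWMetrics(xTrain, yTrain, xTest, yTest):
    model = RandomForestClassifier(n_estimators=200, random_state=0)
    model.fit(xTrain, yTrain)
    prob = model.predict_proba(xTest)[:, 1]  # probability of the positive class
    return model, prob, average_precision_score(yTest, prob), roc_auc_score(yTest, prob)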
Example #4
def save_product():
    pdata = request.get_json()
    print(pdata)
    product = ProductDB(
        name=pdata['name'],
        price=clean(pdata['price']),
        order_cost=clean(pdata['order_cost']),
        initial_inventory=clean(pdata['initial_inventory']),
        demand_dist=pdata['demand_dist'],
        demand_p1=clean(pdata['demand_p1']),
        demand_p2=clean(pdata['demand_p2']),
        demand_p3=clean(pdata['demand_p3']),
        leadtime_dist=pdata['leadtime_dist'],
        leadtime_p1=clean(pdata['leadtime_p1']),
        leadtime_p2=clean(pdata['leadtime_p2']),
        leadtime_p3=clean(pdata['leadtime_p3']),
        )
    db.session.add(product)
    db.session.commit()
    result = {}
    return json.dumps(result), 200
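
# Note: the clean() helper used on the numeric form fields is not shown. A
# plausible sketch, assuming it just coerces optional form strings to floats
# (purely an assumption about this project's helper):
def clean(value):
    # Empty or missing fields become None; everything else becomes a float
    if value in (None, ''):
        return None
    return float(value)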
Example #5
def main():
    """
    main function initiates a kafka consumer, initialize the tweet database.
    Consumer consumes tweets from producer extracts features, cleanses the tweet text,
    calculates sentiments and loads the data into postgres database
    """

    with open("hashtag.txt") as f:
        hashtag = f.read()

    # set-up a Kafka consumer
    consumer = KafkaConsumer("twitter_stream_" + hashtag,
                             auto_offset_reset="earliest")
    os.system("curl -XDELETE localhost:9200/main_index")

    for msg in consumer:
        dict_data = json.loads(msg.value)
        tweet = fn.get_tweet(dict_data["text"])
        polarity, tweet_sentiment = fn.get_sentiment(tweet)
        lang = fn.detect_lang(tweet)

        # add text & sentiment to es
        es.index(
            index="main_index",
            doc_type="test_doc",
            body={
                "author": dict_data["user"]["screen_name"],
                "author_followers": dict_data["user"]["followers_count"],
                "author_statues": dict_data["user"]["statuses_count"],
                "author_verified": dict_data["user"]["verified"],
                "author_account_age":
                fn.get_age(dict_data["user"]["created_at"]),
                "created_at": dict_data["created_at"],
                "@timestamp": fn.get_date(dict_data["created_at"],
                                          to_string=False),
                "message": dict_data["text"],
                "cleaned_text": fn.clean(dict_data["text"]),
                "sentiment_function": tweet_sentiment,
                "polarity": polarity,
                "lang": lang,
                "source": fn.find_device(dict_data["source"]),
            },
        )
        print(str(tweet))
        print("\n")
Example #6
def convert(first_name, insertion, last_name, zip_code, streetnumber,
            email):
    """Convert user input to clean strings"""
    return {
        'first_name': functions.clean(first_name),
        'insertion': functions.clean(insertion, False, False, True),
        'last_name': functions.clean(last_name),
        'zip_code': functions.clean(zip_code, False, uppercase=True),
        'streetnumber': functions.clean(streetnumber, False, uppercase=True),
        'email': functions.clean(email, False, lowercase=True)
    }
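
# Note: the exact signature of functions.clean is not shown. Judging from the
# calls above it appears to take a value plus casing flags; a speculative
# sketch, inferred from the calls and not confirmed by the source:
def clean(value, capitalize=True, uppercase=False, lowercase=False):
    value = str(value).strip()
    if capitalize:
        return value.capitalize()
    if uppercase:
        return value.upper()
    if lowercase:
        return value.lower()
    return value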
Example #7
def get_features(input_list):
    feature_tokens = input_list
    merged_input_list = merge(input_list)
    cleaned_input_list1 = []

    for i in merged_input_list:
        cleaned_input_list1.append(clean(i))

    cleaned_input_list = remove_stopwords(cleaned_input_list1)
    merged_cleaned_input_list = merge(cleaned_input_list)

    feature_no_sw, feature_lemma, feature_stem, feature_pos = extract_features1(
        input_list)
    feature_dp = extract_features2(merged_cleaned_input_list)
    feature_hypernyms, feature_hyponyms, feature_holonyms, feature_meronyms = get_fetures_from_wordNet(
        cleaned_input_list)
    query_dictionary = []
    query = []
    for i in range(len(input_list)):
        query = "tokens: " + "||".join(feature_tokens[i])
        query = query + " no_sw:" + "||".join(feature_no_sw[i])
        if (len(feature_lemma[i]) > 0):
            query = query + " lemmas: " + "||".join(feature_lemma[i])
        if (len(feature_stem[i]) > 0):
            query = query + " stem_words: " + "||".join(feature_stem[i])
        if (len(feature_pos[i]) > 0):
            query = query + " pos_tags: " + "||".join(feature_pos[i])
        if (len(feature_dp[i]) > 0):
            query = query + " head_words: " + "||".join(feature_dp[i])
        # query = query + " head_word:" + head_word
        if (len(feature_hypernyms[i]) > 0):
            query = query + " hypernyms: " + "||".join(feature_hypernyms[i])
        if (len(feature_hyponyms[i]) > 0):
            query = query + " hyponyms: " + "||".join(feature_hyponyms[i])
        if (len(feature_holonyms[i]) > 0):
            query = query + " holonyms: " + "||".join(feature_holonyms[i])

        if (len(feature_meronyms[i]) > 0):
            query = query + " meronyms: " + "||".join(feature_meronyms[i])
        # query_dictionary.append(get_dictionary(qa_bag[i], feature_set, i))
    print('\n\n')
    print(query)
    print()
    return query
Example #8
def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        if rdd.count() == 0:
            raise Exception("Empty")
        sqlContext = getSqlContextInstance(rdd.context)
        df = sqlContext.read.json(rdd, multiLine=True)

        if df.count() == 0:
            raise Exception("Empty")
        udf_func = udf(lambda x: dosentiment(x), returnType=StringType())
        # print(df.head(5))
        df = df.withColumn("sentiment", lit(udf_func(df.text)))
        # print(df.take(10))
        results = df.toJSON().map(lambda j: json.loads(j)).collect()
        # print("Sentiment done")
        for result in results:
            result["created_at"] = fn.get_date(result["created_at"])
            result["@timestamp"] = fn.get_date(result["created_at"], to_string=False)
            result["cleaned_text"] = fn.clean(result["text"])
            result["sentiment"] = json.loads(result["sentiment"])
            polarity, tweet_sentiment = fn.get_sentiment(fn.get_tweet(result["text"]))
            result["sentiment_function"] = tweet_sentiment
            result["polarity"] = polarity
            result["source"] = fn.find_device(result["source"])
            result["user_age"] = fn.get_age(result["user"]["created_at"])
            result["nb_characters"] = len(result["text"])
            for topic in top_topics:
                if topic in result["text"]:
                    result["topic"] = topic
            if hashtag in result["text"]:
                result["topic"] = hashtag
            # print("sentiment loaded")
        to_elastic(results, "main_index", "doc")
        # print("Send to elastic done")
    except Exception as e:
        print(e)
        pass
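
# Note: dosentiment(), wrapped in the UDF above, is not shown. A hedged sketch
# that would satisfy the json.loads(result["sentiment"]) call -- assuming a
# TextBlob-based scorer, which is only a guess:
import json
from textblob import TextBlob

def dosentiment(text):
    blob = TextBlob(text)
    return json.dumps({
        "polarity": blob.sentiment.polarity,
        "subjectivity": blob.sentiment.subjectivity,
    })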
Example #9
option = st.sidebar.text_input('Enter position title: ',
                               value=session_state.a,
                               key=9)
dtotal = pd.read_csv('cleaned_df.csv')

submit = st.sidebar.button('Search', key=1)
if submit:
    session_state.a = option
try:
    dtotal = dtotal[dtotal['title'].astype(str).str.contains(option)]
except Exception:
    pass

total_length = len(dtotal)
dtotal2 = dtotal['description']
dtotal1 = functions.clean(dtotal2)
dtotal1 = functions.word_count(dtotal1)
c_let3 = functions.cleanC(dtotal2)
c_p3 = functions.C_plus(c_let3)
c_s3 = functions.C_sharp(c_let3)
test3a = Counter(c_p3) + Counter(c_s3)

ctotal = Counter(dtotal1) + Counter(test3a)
total = sum(ctotal.values())
Ctotaldict = [(i, ctotal[i] / total * 100.0) for i in ctotal]

total_result = pd.DataFrame(Ctotaldict, columns=['Tech', 'Percentage'])

total_resulty = pd.DataFrame(Ctotaldict, columns=['Tech', 'Percentage'])
total_resulty = total_resulty.set_index('Tech', drop=True)
total_result_chart = total_result.sort_values('Percentage',
Example #10
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical

data = pd.read_csv('total_data_date.csv')

dtitle = data[data['title'].astype(str).str.contains(option)]
search_terms = terms.total_terms
descriptions = dtitle.description  # avoid shadowing the keras 'text' module imported above
text_clean = functions.clean(descriptions)

def tech_count(text):
    # Collect every search term that appears in the cleaned description text
    tech_skills = []
    List1 = [x.lower() for x in search_terms]
    List2 = [x.lower() for x in text]

    for item in List2:
        if item in List1:
            tech_skills.append(item)
    return tech_skills

tech_list = tech_count(text_clean)
Example #11
import argparse

import cv2
import numpy as np
from keras.models import load_model

from functions import clean, read_transparent_png

ap = argparse.ArgumentParser()
ap.add_argument("-i",
                "--image",
                required=True,
                help="Path to the image to be scanned")
args = vars(ap.parse_args())

model = load_model("model.h5")

image = cv2.imread(args["image"], cv2.IMREAD_UNCHANGED)
if image.shape[2] == 4:
    image = read_transparent_png(args["image"])
image = clean(image)
cv2.imshow('gray', image)
cv2.waitKey(0)


def predict(img):
    image_data = img
    dataset = np.asarray(image_data)
    dataset = dataset.reshape((-1, 32, 32, 1)).astype(np.float32)
    print(dataset.shape)
    a = model.predict(dataset)[0]

    classes = np.genfromtxt('classes.csv', delimiter=',')[:, 1].astype(int)

    print(classes)
    new = dict(zip(classes, a))
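
# Note: read_transparent_png is imported but not shown. A hedged sketch that
# composites the alpha channel onto a white background, which is what the
# shape check above suggests it is for (an assumption, not the real helper):
def read_transparent_png(path):
    rgba = cv2.imread(path, cv2.IMREAD_UNCHANGED)
    alpha = rgba[:, :, 3:4].astype(np.float32) / 255.0
    rgb = rgba[:, :, :3].astype(np.float32)
    white = np.full_like(rgb, 255.0)               # white canvas
    composited = rgb * alpha + white * (1.0 - alpha)
    return composited.astype(np.uint8)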
Example #12
    for x in range(len(link_list)):
        #for each link in link_list, check if the downloaded and cleaned file
        #already exists (maybe from a previous interrupted run) and, if not,
        #download and clean all chapters from the links
        file_name = (info['chapter_file_names'].replace(' ', '-') + '-' +
                     str(name_counter))
        file_name = f.delete_forbidden_c(f.forbidden_filenames, file_name)
        #spaces and forbidden characters aren't allowed in links, and the chapter
        #name will be the href link in the content.opf part of the epub file
        if not os.path.exists('clean-' + file_name + ".xhtml"):
            f.download(link_list[x], 'raw-' + file_name + '.html')
            #download all files from link_list
            chapter_title = f.clean('raw-' + file_name + '.html',
                                    'clean-' + file_name + '.xhtml', parser,
                                    info, imgs)
            #clean all downloaded files

        print(f'Chapter {str(x+1)}/{str(len(link_list))} ("{chapter_title}")'
              ' processed...')
        name_counter += 1

    #f.clean() can produce multiple xhtml files when there are images, so the
    #append can't go inside the loop, which only runs once per link_list entry
    files = os.listdir()  #make list of all files and paths in working folder
    cleaned_html_files = [
        i for i in files if i.startswith('clean') and i.endswith('.xhtml')
    ]

    #the cleaned_html_files list will be ordered lexicographically, this
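
# Note: the truncated comment above hints that lexicographic ordering of the
# numbered chapter files is a concern ('...-10.xhtml' sorts before
# '...-2.xhtml'). A hedged sketch of a numeric sort key -- my assumption, not
# the project's code:
import re

def numeric_sort_key(file_name):
    match = re.search(r'(\d+)\.xhtml$', file_name)
    return int(match.group(1)) if match else 0

# cleaned_html_files.sort(key=numeric_sort_key)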
Example #13
    tokens = tokenize_corpus(fileName)

    questions_list, answers_list = create_questions_and_answers(tokens)

    qa_bag = []
    for i in range(0, 50):
        questions_list[i] = list(filter((0).__ne__, questions_list[i]))
        answers_list[i] = list(filter((0).__ne__, answers_list[i]))
        qa_bag.append(questions_list[i] + answers_list[i])

    print()
    merged_qa_bag = merge(qa_bag)

    cleaned_bag1 = []
    for i in merged_qa_bag:
        cleaned_bag1.append(clean(i))

    cleaned_bag = remove_stopwords(cleaned_bag1)
    cleaned_merged_bag = merge(cleaned_bag)

    # print("Hello1")
    # print(merged_qa_bag[0])

    # print("Hello")
    # print(cleaned_merged_bag[0])
    # print(qa_bag[0])
    # print("Bag")

    # print(cleaned_merged_bag)

    feature_tokens = qa_bag
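
# Note: merge() is not shown in this example. Given that clean() is applied to
# each merged entry, merge probably joins each token list back into one string;
# a speculative sketch, not confirmed by the source:
def merge(token_lists):
    return [" ".join(str(token) for token in tokens) for tokens in token_lists]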
Example #14
from functions.regression.ridge import ridge_regression
from functions.transform import transform


def print_stage(msg):
    print("\n")
    print("#" * (len(msg) + 4))
    print("# " + msg + " #")
    print("#" * (len(msg) + 4))


# Reference data / Training Data
print_stage("Cleaning & transforming dataset_without_sale.csv (test.csv)")
ref = pd.read_csv('data/dataset_with_sale.csv')
discover_inconsistencies(ref)
ref = clean(ref)
ref = transform(ref)

# Data to Predict
print_stage("Cleaning & transforming dataset_with_sale.csv (train.csv)")
target = pd.read_csv('data/dataset_without_sale.csv')
discover_inconsistencies(target)
target = clean(target)
target = transform(target)

# Numberizing Data
# Adding Number data
int_ref = prep_regression_data(ref.copy())
int_target = prep_regression_data(target.copy())
# Checking if all dummies are included in dataset and filling in missing dummies
int_ref_length = len(int_ref.columns)
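
# Note: prep_regression_data is not shown. The surrounding comments about
# numberizing the data and filling in missing dummies suggest one-hot
# encoding; a hedged sketch using pandas get_dummies, not the author's helper:
def prep_regression_data(df):
    categorical = df.select_dtypes(include=['object']).columns
    return pd.get_dummies(df, columns=list(categorical))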
Example #15
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
import logging  # Lets us see whether the model converges; uncomment the line below to also write a log file.
#logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

#-------------Start Reading data from csv files--------------------------------
ClassRooms = pandas.read_csv('Classrooms.csv')
ClassRooms.rename(columns={'id': 'classroom_id'}, inplace=True)
ClassRooms.rename(columns={'name': 'classroom_name'}, inplace=True)
cleaned_class_name = [
    fun.clean(name) for name in ClassRooms.classroom_name
    if pandas.notnull(name)
]

Activities = pandas.read_csv('Activities.csv')
Activities.rename(columns={'id': 'activity_id'}, inplace=True)
Activities.rename(columns={'name': 'activity_name'}, inplace=True)
cleaned_act_name = [
    fun.clean(name) for name in Activities.activity_name
    if pandas.notnull(name)
]

Subjects = pandas.read_csv('Subjects.csv')
Subjects.rename(columns={'name': 'subject_name'}, inplace=True)
cleaned_sub_name = [
    fun.clean(name) for name in Subjects.subject_name if pandas.notnull(name)
]
Example #16
import time
import functions as f
import RPi.GPIO as rp
i = f.get_channels()
#try:
#    f.clean()
#except RuntimeWarning:
#    print('NO SETUP!')
#    pass
#else:
#    channels = [29,31]
f.init_out(i)
#    c = 0
#    while c < 20:
#        rp.output(channels[0],rp.HIGH)
#        rp.output(channels[1],rp.HIGH)
#        time.sleep(0.1)
#        rp.output(channels[0],rp.LOW)
#        rp.output(channels[1],rp.LOW)
#        time.sleep(0.1)
#        c += 1
#for a in i:
c = 0
while c < 20:
    rp.output(i, rp.HIGH)
    time.sleep(0.1)
    rp.output(i, rp.LOW)
    time.sleep(0.1)
    c += 1
f.clean()
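
# Note: f.clean() is not shown. Given the commented-out RuntimeWarning handling
# above, it is presumably a thin wrapper around GPIO cleanup; a hedged sketch
# (an assumption about the functions module):
def clean():
    rp.cleanup()  # releases every channel this script has set up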
Example #17
find_all_url = re.findall('data-vr-contentbox-url="(.*)">', html)

#scrape each article
text = ''
name = ''
name_and_text = "Subject: DailyAutoNews-rbc.ru\n"
#the page lists 15 articles in total, so limit this run to the first 5
for i in range(len(find_all_url) - 10):
    url = find_all_url[i]
    page = urlopen(url)
    html = page.read().decode("utf-8")

    #find title
    name = re.findall('<title>(.*)</title>', html)
    name_and_text += '\n\n'
    for j in name:
        name_and_text += j + '\n\n'

    #find all the text between <p>
    text = re.findall('<p>(.*)</p>', html)
    for t in text:
        name_and_text += t

name_and_text = functions.cleanhtml(name_and_text)
name_and_text = functions.clean(name_and_text)
''' debug
html_to_file = open("html.txt","w")
html_to_file.write(name_and_text)
html_to_file.close()
'''
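
# Note: functions.cleanhtml is not shown. A hedged sketch of a simple tag
# stripper consistent with how it is used above -- an assumption, not the
# project's actual helper:
import re

def cleanhtml(raw_html):
    text = re.sub(r'<[^>]+>', '', raw_html)          # drop any remaining tags
    return text.replace('&nbsp;', ' ').replace('&amp;', '&')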