Example #1
def list_hist(source):
    ''' List-of-lists histogram, e.g. [['hello', 1], ['you', 3], ['sir', 4]].
    Takes text, compares each word to every other word in the text, and keeps
    a running total. The used list ensures no word is counted twice.
    '''

    histo = []
    used = []

    text = clean(source)

    # print(text)

    for word in text:
        counter = 0
        if word in used:
            continue

        used.append(word)

        for word2 in text:
            if word == word2:
                counter += 1

        instance = [word, counter]
        histo.append(instance)

    # print(histo)
    return histo
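
For comparison, the same list-of-lists histogram can be built in a single pass with collections.Counter. This is only a sketch: the clean() helper is not shown in these examples, so a hypothetical whitespace-splitting stand-in is included.

from collections import Counter

def clean(text):
    # Hypothetical stand-in for the clean() helper these snippets rely on:
    # lowercase the text and split it on whitespace.
    return text.lower().split()

def list_hist_counter(source):
    # Same list-of-lists result as list_hist(), built in one pass.
    return [[word, count] for word, count in Counter(clean(source)).items()]

print(list_hist_counter("hello you you sir sir sir"))
# [['hello', 1], ['you', 2], ['sir', 3]]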
Example #2
def tuple_hist(source):
    ''' Fastest variant - tuples are immutable. List of tuples: [('hello', 3), ('what', 4)].
    Takes text, compares each word to every other word in the text, and keeps
    a running total. The used list ensures no word is counted twice.
    '''
    histo = []
    used = []

    text = clean(source)
    # print(text)

    for word in text:
        # see if we've used the word before
        counter = 0
        if word in used: 
            continue

        used.append(word)

        for word2 in text: 
            if word == word2:
                counter += 1

        instance = (word, counter)
        histo.append(instance)

    # print(histo)
    return histo
def hello_world():
    # Read the word list, build a histogram, and draw one word from it.
    with open("./words.txt", "r") as my_file:
        lines = clean(my_file)

    my_histogram = list_hist(lines)
    word = prob_sample(my_histogram)

    return word
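
prob_sample() is used above but not shown in these examples; a minimal weighted sampler over a [[word, count], ...] histogram might look like the sketch below (an assumption, not the original implementation).

import random

def prob_sample(histogram):
    # Hypothetical sketch: draw one word, weighted by its count, from a
    # [[word, count], ...] histogram such as list_hist() returns.
    words = [word for word, count in histogram]
    counts = [count for word, count in histogram]
    return random.choices(words, weights=counts, k=1)[0]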
Example #4
def preprocess(df, mode):

    # Clean the text
    df['question1'] = df.question1.map(lambda x: ct.clean(x))
    df['question2'] = df.question2.map(lambda x: ct.clean(x))

    # Prepare the data for the model
    print("Preparing data for model...")

    # While training, create Tokenizer object and also return labels
    if mode == 'train':
        tokenizer = tokenize(df)
        df['question1'] = tokenizer.texts_to_sequences(df.question1)
        df['question2'] = tokenizer.texts_to_sequences(df.question2)
        question1 = np.array(
            list(
                tf.keras.preprocessing.sequence.pad_sequences(df.question1,
                                                              maxlen=maxlen)))
        question2 = np.array(
            list(
                tf.keras.preprocessing.sequence.pad_sequences(df.question2,
                                                              maxlen=maxlen)))
        labels = np.array(list(df.is_duplicate))
        return question1, question2, labels, tokenizer

    # While predicting, load existing Tokenizer object
    if mode == 'predict':
        with open('../checkpoints/tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
        df['question1'] = tokenizer.texts_to_sequences(df.question1)
        df['question2'] = tokenizer.texts_to_sequences(df.question2)
        question1 = np.array(
            list(
                tf.keras.preprocessing.sequence.pad_sequences(df.question1,
                                                              maxlen=maxlen)))
        question2 = np.array(
            list(
                tf.keras.preprocessing.sequence.pad_sequences(df.question2,
                                                              maxlen=maxlen)))
        return question1, question2
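
A hedged usage sketch of the two modes: train_df and test_df are assumed DataFrames with question1, question2 and (for training) is_duplicate columns, and the fitted tokenizer is pickled to the path the 'predict' branch expects.

import pickle

# 'train' returns the fitted tokenizer; persist it for later prediction runs.
question1, question2, labels, tokenizer = preprocess(train_df, mode='train')
with open('../checkpoints/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 'predict' reloads that tokenizer from disk and returns only the padded inputs.
question1, question2 = preprocess(test_df, mode='predict')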
Example #5
def get_all_tweets(screen_name):
    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    all_tweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200, tweet_mode='extended')

    # save most recent tweets
    all_tweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = all_tweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % (oldest))

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)

        # save most recent tweets
        all_tweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = all_tweets[-1].id - 1

        print("...%s tweets downloaded so far" % (len(all_tweets)))

    clean = open(f"/tmp/{screen_name}-clean.txt", "a")
    bad_words = get_bad_phrases(bucket, key)
    final_list = ""
    for tweet in all_tweets:
        if hasattr(tweet, "full_text"):
            raw = tweet.full_text
        else:
            raw = tweet.text

        for phrase in bad_words:
            if phrase in raw:
                raw = ""

        cleaned = clean_text.clean(raw)

        if cleaned != "":
            final_list += (cleaned + "\n")
    clean.write(final_list)
    clean.close()
    return final_list
Example #6
def list_hist(source):
    ''' List-of-lists histogram built without a separate used list: a
    [word, count] entry is appended only if it is not already present. '''
    text = clean(source)

    histogram = []

    for word in text:
        instance = [word, 0]
        for word2 in text:
            if word == word2:
                instance[1] += 1
        if instance not in histogram:
            histogram.append(instance)
    # print(histogram)
    return histogram
def counts_list(source):

    histo = []
    instances = []
    used = []

    text = clean(source)
    # print(text)

    for word in text:
        # check if the word has already been accounted for
        if word in used:
            continue

        counter = 0
        used.append(word)
        # each time another word in the text matches the current word,
        # we have one more instance of it - so increase the counter by 1
        for word2 in text:
            if word == word2:
                counter += 1
        # we know the word and we have its occurrences stored in counter.
        # create a list instance with the word and its occurrences
        # and append it to the list of word instances.
        instance = [word, counter]
        instances.append(instance)

    used_nums = []
    for item in instances:
        #  check if the word frequency has been accounted for before
        if item[1] in used_nums:
            continue
        used_nums.append(item[1])
        membs = []
        new_instance = (
            item[1], membs
        )  # this is what an instance of our histogram looks like
        # for each item in instances, check whether its frequency matches any
        # other frequency in the list; if so, add those words to the members list
        for item2 in instances:
            if item[1] == item2[1]:
                # print(item2[0])
                membs.append(item2[0])

        histo.append(new_instance)

    # print(histo)
    return histo
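
For reference, assuming clean() simply lowercases and splits on whitespace, counts_list() groups words by frequency:

print(counts_list("one fish two fish red fish blue fish"))
# [(1, ['one', 'two', 'red', 'blue']), (4, ['fish'])]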
Example #8
def dict_hist(source):
    '''Cleans the source text and builds an empty dictionary; for each word it
    either creates a zero entry and adds 1, or adds 1 to the existing count.
    '''
    histogram = {}
    text = clean(source)

    # print(text)
    for word in text:
        if word not in histogram:
            histogram[word] = 0
        histogram[word] += 1
    # print(histogram)
    return histogram
Example #9
def stochastic_sample(source):
    ''' Histogram -> percentages -> random item'''
    hist = invert_hist(source)
    percentages = {}
    # print(hist)
    text = clean(source)

    # with open(text) as file:
    #     word_list = (file.read().split(" "))
    length = len(text)
    dart = random.randint(0, 100)  # index is not length
    # print(length)
    # print(word_list)
    # print("dart: {}".format(dart))

    # calculate total for percent
    total = 0
    for item in hist:
        total += (len(item[1]) * item[0])
        # print("len item: {}".format(len(item[1]))) # number of words per index
    # individual percentages added to the percentages dict
    for item in hist:
        for word in item[1]:
            word_percentage = item[0] / total
            percentages[word] = word_percentage * 100


    # find where the dart landed: each word has a percentage assigned, and we
    # add them up until the running total passes the dart
    num = 0
    target = None
    for word in percentages:
        num += percentages[word]
        target = word
        # print("trial - num: {}, target: {}".format(num, target)) #prints each trial through the hist
        if num > dart:  # we add til we break it. if we cross the line, we return
            break

    # print("num: {}, target: {}".format(num, target))
    # print('')
    # print(hist)
    # print('')
    # print(percentages)
    # print(target)
    # print(str(target))
    return str(target)
Example #10
def dict_hist(source):
    ''' Dictionary of key-value pairs, e.g. {'hello': 1, 'sir': 2, 'how': 5}.
    Takes text and increments a running total for each word as it appears;
    the dictionary itself prevents duplicate entries.'''

    histo_dict = {}

    text = clean(source)
    # print(text)

    for word in text:
        if word in histo_dict:
            histo_dict[word] += 1
        else:
            histo_dict[word] = 1

    # print(histo_dict)
    return histo_dict
Example #11
def tuple_hist(source):
    text = clean(source)

    histogram = []
    # text = separate(text)
    cache = []

    for word in text:
        if word not in cache:
            cache.append(word)
            num_occur = 0
            for word2 in text:
                if word2 == word:
                    num_occur += 1
            # (word, num_occur) is already a tuple; no extra conversion needed
            histogram.append((word, num_occur))

    return histogram
Example #12
def markov(source):

    text = clean(source)

    chain = {}

    current = None
    prev = None

    for word in text:
        current = word

        if not chain:
            chain[current] = None
        else:
            # note: each word keeps only its most recently seen successor;
            # a repeated word overwrites any transition recorded earlier
            chain[prev] = current

        prev = current

    return chain
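
The function above keeps only a single successor per word. A common alternative, sketched here, stores every observed successor so that repeated words do not overwrite earlier transitions:

from collections import defaultdict

def markov_chain(source):
    # Sketch: map each word to the list of words that follow it in the text.
    text = clean(source)
    chain = defaultdict(list)
    for prev, current in zip(text, text[1:]):
        chain[prev].append(current)
    return dict(chain)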
Example #13
x_text = np.array(twitter_df.tokens)
x_text = labelizeTweets(x_text, 'ACTUAL')

tweet_vecs = np.concatenate(
    [buildWordVector(z, n_dim) for z in map(lambda x: x.words, x_text)])
tweet_vecs = scale(tweet_vecs)
print("word vectors are created")

df = capture_sentiment(twitter_df, tweet_vecs)

df_pos = df[df['Sentiment'] == 1]
df_neg = df[df['Sentiment'] == 0]

documents = list(df_pos['SentimentText'])
doc_clean = [clean(doc).split() for doc in documents]
print("documents are cleaned")

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
dictionary.filter_extremes()

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Alias for the LDA model class from the gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix.
print("training LDA started")
ldamodel = Lda(doc_term_matrix,
               num_topics=10,       # assumed settings; the original snippet
               id2word=dictionary,  # is truncated at this call
               passes=50)
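
Assuming the call above completes (its arguments are reconstructed, since the original snippet is truncated), a typical next step is to inspect the learned topics:

# Show the top words of each topic found by the LDA model.
for topic_id, terms in ldamodel.print_topics(num_words=5):
    print(topic_id, terms)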
Example #14
def generate(tweets):
    user = random.randint(0, len(tweets) - 1)

    bad_phrases = get_bad_phrases(bucket, key)

    tweet_pool = []

    if random.randint(0, 4) == 0:
        tweet_pool = tweets[user]
    else:
        for person in tweets:
            tweet_pool += person

    random.shuffle(tweet_pool)

    tweet_sample_size = 30

    bigram_dict, trigram_dict, firstWords = generate_ngrams.generate_ngrams(
        tweet_pool[0:tweet_sample_size])

    while True:
        written = ""
        written += firstWords[random.randint(0, len(firstWords) - 1)]

        while "RETRYRETRY" not in written.upper(
        ) and "terminate" not in written.lower():
            generated = generate_next_word(written, bigram_dict, trigram_dict)
            written += " " + generated

        if ("RETRYRETRY" not in written.upper()):

            valid = True

            final = re.sub(" terminate| Terminate| TERMINATE", "", written)

            final = final.rstrip()
            if len(final) > 2:
                final = final[0].capitalize() + final[1:]

            if final.count(" ") > 1 and len(final) <= 120:
                clean_sub = clean_text.clean(
                    final[int(.20 * len(final)):int(len(final) * .80)])
                for tweet in tweet_pool[0:tweet_sample_size]:
                    base_words = tweet.upper().split(" ")
                    base_words = list(dict.fromkeys(base_words))
                    base_words.sort()

                    generated_words = final.upper().split(" ")
                    generated_words = list(dict.fromkeys(generated_words))
                    generated_words.sort()

                    if base_words == generated_words:
                        valid = False
                    if clean_text.clean(clean_sub).upper() in clean_text.clean(
                            tweet).upper():
                        valid = False
                        # print ("\nnot tweeting:  ",clean_sub,"\nbecause it is in:  ",tweet,"\n")
                    for phrase in bad_phrases:
                        if phrase.upper() in tweet.upper():
                            valid = False
            else:
                valid = False
            # strip quotes unless they are absent or form one balanced pair
            if final.count("\"") not in (0, 2):
                final = re.sub("\"", "", final)
            if valid:
                return final
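
generate_next_word() is not shown in these examples; a minimal sketch consistent with how it is called above might look like this (the shapes of the n-gram dictionaries from generate_ngrams are assumptions):

import random

def generate_next_word(written, bigram_dict, trigram_dict):
    # Hypothetical sketch: prefer the trigram table keyed by the last two
    # words, fall back to the bigram table keyed by the last word, and emit
    # the RETRYRETRY sentinel when neither table has an entry.
    words = written.split(" ")
    key = (words[-2], words[-1]) if len(words) >= 2 else None
    if key in trigram_dict:
        return random.choice(trigram_dict[key])
    if words[-1] in bigram_dict:
        return random.choice(bigram_dict[words[-1]])
    return "RETRYRETRY"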
Example #15
        name = tree.xpath('//div/strong/text()')

        if len(name) != 1:
            continue
        else:
            name = name[0]

        rating = tree.xpath('//span[@class="gen-user-ratings"]/text()')

        rating = [r for r in rating if r != ' ']

        if len(rating) < 1:
            continue
        else:
            vendor_rating = clean(rating[0].replace('~', '.').replace(
                '/', '.').replace(' deals', '').replace('.5, ', ' '))

        test = tree.xpath(
            '//div[@class="embedded-feedback-list"]/table/tr/td//text()')
        test = [t for t in test if t.replace(' ', '') != '']

        vals_re = re.compile('[0-5]/[0-5]')
        days_ago_re = re.compile('[0-9]+ days ago')

        if len(test) > 0:
            rating_inds = [
                ind for ind, t in enumerate(test) if vals_re.match(t)
            ]
            days_ago_inds = [
                ind for ind, t in enumerate(test) if days_ago_re.match(t)
            ]
        else:
            continue

        # The only reviews which are really reviews are those which
        # are followed by a 'left X days ago' thing
Example #16
import random
import clean_text
with open('./opspam_reviews.txt', 'r') as f:
    revs = f.readlines()

with open('./opspam_labels.txt', 'r') as f:
    labs = f.readlines()

labs = [int(l.strip()) for l in labs]

revs = [clean_text.clean(r) for r in revs]

decep = [r for r, l in zip(revs, labs) if l == 1]
nondecep = [r for r, l in zip(revs, labs) if l == 0]
nondecep = list(set(nondecep))  # remove duplicate reviews

decep_idx = random.sample(list(range(len(decep))), len(decep))
nondecep_idx = random.sample(list(range(len(nondecep))), len(nondecep))

decep = [decep[i] for i in decep_idx]
nondecep = [nondecep[i] for i in nondecep_idx]

train_decep = decep[0:640]
test_decep = decep[640:]
train_nondecep = nondecep[0:636]
test_nondecep = nondecep[636:]

train = train_decep + train_nondecep
train_labs = [1] * len(train_decep) + [0] * len(train_nondecep)
test = test_decep + test_nondecep
test_labs = [1] * len(test_decep) + [0] * len(test_nondecep)
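
The split above leaves the training set ordered as all deceptive reviews followed by all truthful ones; if the downstream model is order-sensitive, the set can be shuffled while keeping reviews and labels aligned (a sketch, not part of the original split):

# Shuffle the combined training set, keeping texts and labels in step.
train_order = random.sample(range(len(train)), len(train))
train = [train[i] for i in train_order]
train_labs = [train_labs[i] for i in train_order]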
Example #17
def __init__(self, corpus):
    self.corpus = clean(corpus)
    self.states = {}
    self.chain()