Example #1
def main():

    hourlyInterval = 0 # are we building hourly or daily histograms?
    rawOccurrenceModel = {} # keyed by term pairing

    # Did they provide the correct args?
    if len(sys.argv) != 4:
        usage()
        sys.exit(-1)

    # Parse command line
    if sys.argv[1] == "hourly":
        hourlyInterval = 1
    elif sys.argv[1] == "daily":
        pass
    else:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[2], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)
    
    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        info = tweetclean.extract(i)
        if info == None:
            sys.exit(-1)

        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        if hourlyInterval:
            date = tweetdate.buildDateInt(info[0])
        else:
            date = tweetdate.buildDateDayInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])
        
        rawOccurrenceModel = \
            languagemodel.update_matrix(
                                        rawOccurrenceModel,
                                        languagemodel.build_matrix(newTweet, "-&"))

    # --------------------------------------------------------------------------
    # Debug, Dump the Raw Occurrences (not finalized)
    for k, v in rawOccurrenceModel.items():
        print "%s:%d" % (k, v)
Example #2
def data_pull(database_file, query):
    """Pull the data from the database."""
    
    user_tweets = {}
    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    
    for row in conn.cursor().execute(query):
        if row['text'] is not None:
            data = tweetclean.cleanup(row['text'], True, True)
            try:
                user_tweets[row['owner']].append(data)
            except KeyError:
                user_tweets[row['owner']] = []
                user_tweets[row['owner']].append(data)

    conn.close()

    return user_tweets
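
The `try`/`except KeyError` pair above is the usual "append, creating the list on first use" idiom; `collections.defaultdict(list)` expresses it more directly. A sketch of the same loop body, keeping the `owner`/`text` columns from the query above:

from collections import defaultdict

user_tweets = defaultdict(list)

for row in conn.cursor().execute(query):
    if row['text'] is not None:
        # the per-user list is created automatically on first access
        data = tweetclean.cleanup(row['text'], True, True)
        user_tweets[row['owner']].append(data)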
Example #3
def data_pull(database_file, query):
    """Pull the data from the database."""

    user_tweets = {}
    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    for row in conn.cursor().execute(query):
        if row['text'] is not None:
            data = cleanup(row['text'], True, True)
            try:
                user_tweets[row['owner']].append(data)
            except KeyError:
                user_tweets[row['owner']] = []
                user_tweets[row['owner']].append(data)

    conn.close()

    return user_tweets
Example #4
def thread_main(database_file, output_folder, users, stopwords, start, cnt):
    """
    What, what! : )
    """
    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    # --------------------------------------------------------------------------
    # Process this thread's users.
    for u in xrange(start, start + cnt):
        user_id = users[u]
        users_tweets = {}
        output = "%d\t%d\t%d\t%fm"

        start = time.clock()

        for row in conn.cursor().execute(query_tweets % user_id):
            if row['text'] is not None:
                users_tweets[row['id']] = \
                    tweetclean.cleanup(row['text'], True, True)

        curr_cnt = len(users_tweets)

        doc_tfidf, ignore = vectorspace.build_doc_tfidf(users_tweets, stopwords)

        # ----------------------------------------------------------------------
        centroids = centroid.cluster_documents(doc_tfidf)

        duration = (time.clock() - start) / 60 # for minutes

        print output % (user_id, curr_cnt, len(centroids), duration)

        with open(os.path.join(output_folder, "%d.topics" % user_id), "w") as f:
            f.write("user: %d\n#topics: %d\n" % (user_id, len(centroids)))
            # Might be better if I just implement __str__ for Centroids.
            for cen in centroids:
                f.write("%s\n" % str(centroids[cen]))
            f.write("-------------------------------------------------------\n")

    conn.close()
Example #5
def data_pull(database_file, query):
    """Pull the data from the database."""

    user_data = {}
    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    for row in conn.cursor().execute(query):
        if row['text'] is not None:
            data = cleanup(row['text'], True, True)
            twt = TweetTime(row['created'])
            uid = row['owner']

            # could probably get away with pushing this up -- like in c++.
            mdv = twt.get_month()["day_val"]

            try:
                user_data[uid].add_data(mdv, data)
            except KeyError:
                user_data[uid] = frame.FrameUser(uid, mdv, data)

    conn.close()

    return user_data
Example #6
def main():

    cleanTweets = {}   # dictionary of the tweets by id as integer

    docFreq = {}       # dictionary of in how many documents the "word" appears
    invdocFreq = {}    # dictionary of the inverse document frequencies
    docTermFreq = {}   # dictionary of term frequencies by date as integer
    docTfIdf = {}      # similar to docTermFreq, but holds the tf-idf values

    # Did they provide the correct args?
    if len(sys.argv) != 3:
        usage()
        sys.exit(-1)

    # Pull lines
    with codecs.open(sys.argv[1], "r", 'utf-8') as f:
        tweets = f.readlines()

    # Pull stop words
    with open(sys.argv[2], "r") as f:
        stopwords = f.readlines()

    # clean them up!
    for i in xrange(0, len(stopwords)):
        stopwords[i] = stopwords[i].strip()

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        # Each tweet has <id>DATE-TIME</id> and <text>DATA</text>.
        #
        # So we'll have a dictionary<string, string> = {"id", "contents"}
        #
        # So, we'll just append to the end of the string for the dictionary
        # entry.
        info = tweetclean.extract_id(i)
        if info == None:
            sys.stderr.write("Invalid tweet hit\n")
            sys.exit(-1)

        # Add this tweet to the collection of clean ones.
        cleanTweets[info[0]] = tweetclean.cleanup(info[1], True, True)

    docLength = {}

    # --------------------------------------------------------------------------
    # Process the collected tweets
    for id in cleanTweets.keys():
        # Calculate Term Frequencies for this id/document.
        # Skip 1 letter words.
        # let's make a short list of the words we'll accept.
        pruned = [w for w in cleanTweets[id].split(' ') if len(w) > 1 and w not in stopwords]

        # skip documents that only have one word.
        if len(pruned) < 2:
            continue

        docTermFreq[id] = {} # Prepare the dictionary for that document.
        
        for w in pruned:
            try:
                docLength[id] += 1
            except KeyError:
                docLength[id] = 1

            try:
                docTermFreq[id][w] += 1
            except KeyError:
                docTermFreq[id][w] = 1

        # Contribute to the document frequencies.
        for w in docTermFreq[id]:
            try:
                docFreq[w] += 1
            except KeyError:
                docFreq[w] = 1

    # --------------------------------------------------------------------------
    # Dump how many unique terms were identified by space splitting.
    print "Total Count of Terms: %s" % docLength
    print "Unique Terms: %d" % len(docFreq)
    print "How many Documents: %d" % len(docTermFreq)
    
    # --------------------------------------------------------------------------
    # Remove singletons -- standard practice.
    # Skipped with tweets for now...

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(docTermFreq), docFreq)

    # Calculate the tf-idf values.
    docTfIdf = vectorspace.calculate_tfidf(docLength, docTermFreq, invdocFreq)

    # --------------------------------------------------------------------------
    # Recap of everything we have stored.
    # docLength      is the dictionary of term counts per document
    # cleanTweets    is the dictionary of the tweets by id as string
    # docFreq        is the dictionary of in how many documents the "word" appears
    # invdocFreq     is the dictionary of the inverse document frequencies
    # docTermFreq    is the dictionary of term frequencies by date as integer
    # docTfIdf       is similar to docTermFreq, but holds the tf-idf values

    # --------------------------------------------------------------------------
    # Build Centroid List
    centroids = []

    for doc, vec in docTfIdf.iteritems():
        centroids.append(centroid.Centroid(str(doc), vec))

    similarities = centroid.get_sims(centroids)
    average_sim = centroid.find_avg(centroids, True, similarities)
    stddev_sim = centroid.find_std(centroids, True, similarities)
    
    print "mean: %.10f\tstd: %.10f" % (average_sim, stddev_sim)
    
    # --------------------------------------------------------------------------
    # Merge centroids by highest similarity of at least threshold  
    threshold = (average_sim + stddev_sim)

    while len(centroids) > 1:
        i, j, sim = centroid.find_max(centroids)

        # @warning: This is fairly crap.
        if sim >= threshold:
            centroids[i].add_centroid(centroids[j])
            del centroids[j]
            print "merged with sim: %.10f" % sim
        else:
            break

    print "len(centroids): %d" % len(centroids)
    print "avg(centroids): %.10f" % average_sim
    print "std(centroids): %.10f" % stddev_sim
    
    for cen in centroids:
        print centroid.topTerms(cen, 10)
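
The `vectorspace.calculate_invdf` / `calculate_tfidf` helpers are not shown here. Assuming they implement the standard weighting (log-scaled inverse document frequency, term frequency normalised by document length), they amount to roughly this sketch:

import math

def calculate_invdf(num_docs, doc_freq):
    """idf(term) = log(N / df(term)), one value per term."""
    return dict((term, math.log(float(num_docs) / df))
                for term, df in doc_freq.items())

def calculate_tfidf(doc_length, doc_term_freq, inv_doc_freq):
    """tf-idf(doc, term) = (count / len(doc)) * idf(term)."""
    tfidf = {}
    for doc, counts in doc_term_freq.items():
        tfidf[doc] = dict((term, float(cnt) / doc_length[doc] * inv_doc_freq[term])
                          for term, cnt in counts.items())
    return tfidf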
Example #7
def main():

    # Did they provide the correct args?
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    database_file = sys.argv[1]
    user_id = int(sys.argv[2])
    stop_file = sys.argv[3]
    outputvocab = sys.argv[4]
    outputdata = sys.argv[5]

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.importStopWords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}
    docTermFreq = {}  # dictionary of term frequencies by date as integer
    vocab = []  # array of terms

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    for row in c.execute(query_tweets % user_id):
        users_tweets[row['id']] = row['text']

    conn.close()

    # --------------------------------------------------------------------------
    # Process tweets
    for id in users_tweets:

        if users_tweets[id] == None:  # this happens, lol.
            continue

        users_tweets[id] = tweetclean.cleanup(users_tweets[id], True, True)

        # Calculate Term Frequencies for this id/document.
        # Skip 1 letter words.

        # let's make a short list of the words we'll accept.
        pruned = [w for w in users_tweets[id].split(' ') \
                  if len(w) > 1 and w not in stopwords]

        # skip documents that only have one word.
        if len(pruned) < 2:
            continue

        docTermFreq[id] = {}  # Prepare the dictionary for that document.

        for w in pruned:
            try:
                docTermFreq[id][w] += 1
            except KeyError:
                docTermFreq[id][w] = 1

            if w not in vocab:  # slow. linear search... maybe switch to a sorted method?
                vocab.append(w)

    vocab.sort()

    # --------------------------------------------------------------------------
    # Build the vocab.txt file
    with open(outputvocab, 'w') as f:
        f.write("\n".join(vocab))

    # --------------------------------------------------------------------------
    # Given the vocab array, build the document term index + counts:
    sorted_tweets = sorted(users_tweets.keys())

    data = ""

    for id in sorted_tweets:
        try:
            lens = len(docTermFreq[id])
        except KeyError:
            continue

        print "%d" % id
        data += "%d " % lens

        for term in docTermFreq[id]:
            indx = getIndx(vocab, term)
            if indx == -1:
                sys.exit(-1)
            data += "%d:%d " % (indx, docTermFreq[id][term])

        data += "\n"

    with open(outputdata, "w") as f:
        f.write(data)
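
The comment in the loop above already flags the linear `w not in vocab` test, and `getIndx` is not shown either. Since `vocab` is sorted before it is used, the standard-library `bisect` module can keep it sorted while building and answer `getIndx` with a binary search; a sketch of that alternative:

import bisect

def add_term(vocab, term):
    """Insert term into the sorted vocab only if it is not already present."""
    i = bisect.bisect_left(vocab, term)
    if i == len(vocab) or vocab[i] != term:
        vocab.insert(i, term)

def getIndx(vocab, term):
    """Binary search: index of term in the sorted vocab, or -1 if absent."""
    i = bisect.bisect_left(vocab, term)
    if i < len(vocab) and vocab[i] == term:
        return i
    return -1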
Example #8
def main():

    # Did they provide the correct args?
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)
    
    database_file = sys.argv[1]
    user_id = int(sys.argv[2])
    stop_file = sys.argv[3]
    outputvocab = sys.argv[4]
    outputdata = sys.argv[5]    

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.importStopWords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}
    docTermFreq = {}   # dictionary of term frequencies by date as integer
    vocab = []         # array of terms
    
    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    for row in c.execute(query_tweets % user_id):
        users_tweets[row['id']] = row['text']

    conn.close()

    # --------------------------------------------------------------------------
    # Process tweets
    for id in users_tweets:

        if users_tweets[id] == None: # this happens, lol.
            continue

        users_tweets[id] = tweetclean.cleanup(users_tweets[id], True, True)
        
        # Calculate Term Frequencies for this id/document.
        # Skip 1 letter words.

        # let's make a short list of the words we'll accept.
        pruned = [w for w in users_tweets[id].split(' ') \
                  if len(w) > 1 and w not in stopwords]

        # skip documents that only have one word.
        if len(pruned) < 2:
            continue

        docTermFreq[id] = {} # Prepare the dictionary for that document.
        
        for w in pruned:
            try:
                docTermFreq[id][w] += 1
            except KeyError:
                docTermFreq[id][w] = 1
            
            if w not in vocab: # slow. linear search... maybe switch to a sorted method?
                vocab.append(w)
    
    vocab.sort()
    
    # --------------------------------------------------------------------------
    # Build the vocab.txt file
    with open(outputvocab, 'w') as f:
        f.write("\n".join(vocab))
    
    # --------------------------------------------------------------------------
    # Given the vocab array, build the document term index + counts:
    sorted_tweets = sorted(users_tweets.keys())
    
    data = ""
    
    for id in sorted_tweets:
        try:
            lens = len(docTermFreq[id])
        except KeyError:
            continue

        print "%d" % id
        data += "%d " % lens
        
        for term in docTermFreq[id]:
            indx = getIndx(vocab, term)
            if indx == -1:
                sys.exit(-1)
            data += "%d:%d " % (indx, docTermFreq[id][term])
            
        data += "\n"

    with open(outputdata, "w") as f:
        f.write(data)
Example #9
def main():
    """."""

    # Did they provide the correct args?
    if len(sys.argv) != 6:
        usage()
        sys.exit(-1)

    cpus = multiprocessing.cpu_count()

    # --------------------------------------------------------------------------
    # Parse the parameters.
    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    maximum = int(sys.argv[3])
    stop_file = sys.argv[4]
    output_folder = sys.argv[5]

    if minimum >= maximum:
        usage()
        sys.exit(-2)

    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    kickoff = \
"""
-------------------------------------------------------------------
parameters  :
    database  : %s
    minimum   : %d
    maximum   : %d
    output    : %s
    stop      : %s
-------------------------------------------------------------------
"""

    print kickoff % (database_file, minimum, maximum, output_folder, stop_file)

    # this won't return the 3 columns we care about.
    query_collect = \
        "select owner from tweets group by owner having count(*) >= %d and count(*) < %d"
    # "select id, contents as text from tweets where owner = %d;"
    query_prefetch = \
        "select owner, id, contents as text from tweets where owner in (%s);"

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    print "#cpus: %d" % cpus

    # --------------------------------------------------------------------------
    # Search the database file for users.
    users = []
    users_tweets = {}

    start = time.clock()

    query = query_prefetch % query_collect

    for row in c.execute(query % (minimum, maximum)):
        uid = row['owner']
        if uid not in users:
            users.append(uid)
        if row['text'] is not None:
            data = tweetclean.cleanup(row['text'], True, True)
            try:
                users_tweets[uid][row['id']] = data
            except KeyError:
                users_tweets[uid] = {}
                users_tweets[uid][row['id']] = data

    print "query time: %fm" % ((time.clock() - start) / 60)
    print "users: %d\n" % len(users)

    conn.close()

    # --------------------------------------------------------------------------
    # Process those tweets by user set.

    print "usr\tcnt\tavg\tstd\tend\tdur"

    cnt = int(math.ceil((float(len(users)) / cpus)))
    remains = len(users)
    threads = []

    for i in range(0, cpus):
        start = i * cnt

        if cnt > remains:
            cnt = remains

        print "launching thread: %d, %d" % (start, cnt)

        t = threading.Thread(target=thread_main,
                             args=(
                                 output_folder,
                                 users,
                                 users_tweets,
                                 stopwords,
                                 start,
                                 cnt,
                             ))
        threads.append(t)
        t.start()

        remains -= cnt
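
As listed, the launcher starts its workers but never joins them, so `main` can return while `.topics` files are still being written (unless the example is simply truncated here). A minimal addition, reusing the `threads` list built above:

    # Block until every worker thread has finished writing its output.
    for t in threads:
        t.join()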
Example #10
def main():

    daysTweets = {} # dictionary of the tweets by date as integer
    docFreq = {}    # dictionary of document frequencies
    daysHisto = {}  # dictionary of the n-grams by date as integer

    # Did they provide the correct args?
    if len(sys.argv) != 2:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[1], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        # Each tweet has <created>DATE-TIME</created> and <text>DATA</text>.
        #
        # So we'll have a dictionary<string, string> = {"date", "contents"}
        #
        # So, we'll just append to the end of the string for the dictionary
        # entry.
        info = tweetclean.extract(i)
        if info == None:
            sys.exit(-1)
        
        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        date = tweetdate.buildDateInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        # Add this tweet to the collective tweet for the day.
        if date in daysTweets:
            daysTweets[date] += " " + newTweet
        else:
            daysTweets[date] = newTweet

    # End of: "for i in tweets:"
    # Thanks to Python for not letting me use curly braces.

    # --------------------------------------------------------------------------
    # Process the collected tweets
    print "tweet days: %d" % len(daysTweets)
    gramSize = 3

    for day in sorted(daysTweets.keys()):
        daysHisto[day] = {} # initialize the sub-dictionary
        totalDaysTerms = 0  # for normalizing the term frequencies, so days with more tweets don't skew values.

        # This gives me values, starting at 0, growing by gramSize for length of the tweet.
        # range(0, len(daysTweets[day]), gramSize)
        # This should give you values, starting at 0 for length of the tweet.
        # range(0, len(daysTweets[day]), 1)
        #
        for j in range(0, len(daysTweets[day]), gramSize):
            # this doesn't seem to do the sliding window I was expecting but rather just chunks it.
            w = daysTweets[day][j:j + gramSize]
            
            # wu is a special format that will not screw with whitespace
            wu = "_%s_" % w
            totalDaysTerms += 1
            
            try:
                daysHisto[day][wu] += 1
            except KeyError:
                daysHisto[day][wu] = 1

            try:
                docFreq[wu] += 1
            except KeyError:
                docFreq[wu] = 1

        # print results to file for day.
        # unsorted
        for gram in daysHisto[day]:
            # I am making it smaller by the size of the document.
            v = float(daysHisto[day][gram]) / totalDaysTerms
            daysHisto[day][gram] = v

    # daysHisto Contains normalized term frequencies, not tf-idf values.
    # Normalized to account for the length of the document.  It would not
    # be difficult to modify it to contain tf-idf values.  It would just have
    # to wait until all processing is complete.

    # Dump the matrix.
    print vectorspace.dumpMatrix(docFreq, daysHisto) + "\n"
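
As the comment inside the loop notes, stepping `range(...)` by `gramSize` chunks the day's text rather than sliding over it. If overlapping character n-grams are what was intended, stepping the window by 1 gives them; a small sketch:

def char_ngrams(text, gram_size):
    """Overlapping character n-grams: advance the window by 1, not by gram_size."""
    return [text[j:j + gram_size] for j in range(0, len(text) - gram_size + 1)]

# char_ngrams("tweet", 3) -> ['twe', 'wee', 'eet']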
Example #11
def main():

    docLength = 0  # total count of all terms
    daysTweets = {}  # dictionary of the tweets by date as integer
    invdocFreq = {}  # dictionary of the inverse document frequencies
    docFreq = {}  # dictionary of document frequencies
    daysHisto = {}  # dictionary of the n-grams by date as integer

    # Did they provide the correct args?
    if len(sys.argv) != 2:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[1], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        info = tweetclean.extract(i)
        if info == None:
            sys.exit(-1)

        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        date = tweetdate.buildDateInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        # Add this tweet to the collective tweet for the day.
        if date in daysTweets:
            daysTweets[date] += " " + newTweet
        else:
            daysTweets[date] = newTweet

    # End of: "for i in tweets:"
    # Thanks to Python for not letting me use curly braces.

    # --------------------------------------------------------------------------
    # Process the collected tweets
    print "tweet days: %d" % len(daysTweets)
    gramSize = 3
    docLength = {}

    for day in sorted(daysTweets.keys()):
        daysHisto[day] = {}  # initialize the sub-dictionary

        # This gives me values, starting at 0, growing by gramSize for length of the tweet.
        # range(0, len(daysTweets[day]), gramSize)
        # This should give you values, starting at 0 for length of the tweet.
        # range(0, len(daysTweets[day]), 1)
        #
        for j in range(0, len(daysTweets[day]), gramSize):
            # this doesn't seem to do the sliding window I was expecting but rather just chunks it.
            w = daysTweets[day][j:j + gramSize]

            # wu is a special format that will not screw with whitespace
            wu = "_%s_" % w
            try:
                docLength[day] += 1
            except KeyError:
                docLength[day] = 1

            try:
                daysHisto[day][wu] += 1
            except KeyError:
                daysHisto[day][wu] = 1

            try:
                docFreq[wu] += 1
            except KeyError:
                docFreq[wu] = 1

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(daysHisto), docFreq)

    # Calculate the tf-idf values.
    daysHisto = vectorspace.calculate_tfidf(docLength, daysHisto, invdocFreq)

    # Dump the matrix.
    #print vectorspace.dumpMatrix(docFreq, daysHisto) + "\n"

    # Compute cosine similarities between sequential days.
    sorted_days = sorted(daysHisto.keys())
    for i in range(0, len(sorted_days) - 1):
        print "similarity(%s, %s) = " % (str(
            sorted_days[i]), str(sorted_days[i + 1])),
        print vectorspace.cosineCompute(daysHisto[sorted_days[i]],
                                        daysHisto[sorted_days[i + 1]])
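
`vectorspace.cosineCompute` is not shown; on these sparse term-to-weight dictionaries it presumably computes the usual cosine similarity, roughly like this sketch:

import math

def cosine_compute(vec_a, vec_b):
    """Cosine similarity of two sparse term->weight dictionaries."""
    dot = sum(w * vec_b[t] for t, w in vec_a.items() if t in vec_b)
    norm_a = math.sqrt(sum(w * w for w in vec_a.values()))
    norm_b = math.sqrt(sum(w * w for w in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)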
Example #12
def thread_main(database_file, output_folder, users, stopwords, start, cnt):
    """
    Process the users in your range!
    
    Each thread gets its own hook into the database, so they don't interfere.
    
    I could use the whole Queue thing... but I don't feel like trying to get 
    that to work as well.
    """

    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    # --------------------------------------------------------------------------
    # Process this thread's users.
    for j in xrange(start, start + cnt):
        user_id = users[j]
        print "processing: %d" % user_id
        for row in c.execute(query_tweets % user_id):
            if row['text'] is not None:
                users_tweets[row['id']] = \
                    tweetclean.cleanup(row['text'], True, True)

        # only words that are greater than one letter and not in the stopword
        # list.
        texts = [[word for word in users_tweets[uid].split() \
                  if word not in stopwords and len(word) > 1] \
                    for uid in users_tweets]

        # ----------------------------------------------------------------------
        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) \
                          if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once] \
                    for text in texts]

        dictionary = corpora.Dictionary(texts)
        # store the dictionary, for future reference
        #dictionary.save(os.path.join("lda_out", '%d.dict' % user_id))

        corpus = [dictionary.doc2bow(text) for text in texts]
        # store to disk, for later use
        #corpora.MmCorpus.serialize(
        #                           os.path.join(
        #                                        output_folder,
        #                                        '%d.mm' % user_id),
        #                           corpus)

        # ----------------------------------------------------------------------
        # is this different...
        #corpus = \
        #    corpora.MmCorpus(os.path.join(output_folder, '%d.mm' % user_id))

        lda = models.ldamodel.LdaModel(corpus,
                                       id2word=dictionary,
                                       chunksize=100,
                                       passes=20,
                                       num_topics=100)
        #lda.save('%d.lda' % user_id)

        # ----------------------------------------------------------------------
        topic_strings = lda.show_topics(topics=-1, formatted=True)
        # shit, they share an output_file, so they could interrupt each other.
        ### so switch to individual files...
        ###
        with open(os.path.join(output_folder, "%d.topics" % user_id),
                  "w") as f:
            f.write("user: %d\n#topics: %d\n" % (user_id, len(topic_strings)))
            for topic in topic_strings:  # could use .join
                f.write("%s\n" % str(topic))

    conn.close()
Example #13
def main():
    # Weirdly in Python, you have free access to globals from within main().

    hourlyInterval = 0 # are we building hourly or daily histograms?
    docLength = 0      # total count of all terms
    daysTweets = {}    # dictionary of the tweets by date as integer,
                       # or by date-hour as integer when building hourly histograms
                                          
    docFreq = {}       # dictionary of in how many documents the "word" appears
    invdocFreq = {}    # dictionary of the inverse document frequencies
    docTermFreq = {}   # dictionary of term frequencies by date as integer
    docTfIdf = {}      # similar to docTermFreq, but holds the tf-idf values

    # Did they provide the correct args?
    if len(sys.argv) != 5:
        usage()
        sys.exit(-1)

    # Parse command line
    if sys.argv[1] == "hourly":
        hourlyInterval = 1
    elif sys.argv[1] == "daily":
        pass
    else:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[2], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        # Each tweet has <created>DATE-TIME</created> and <text>DATA</text>.
        #
        # So we'll have a dictionary<string, string> = {"date", "contents"}
        #
        # So, we'll just append to the end of the string for the dictionary
        # entry.
        info = tweetclean.extract(i)
        if info == None:
            sys.exit(-1)

        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        if hourlyInterval:
            date = tweetdate.buildDateInt(info[0])
        else:
            date = tweetdate.buildDateDayInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        # Add this tweet to the collective tweet for the day.
        if date in daysTweets:
            daysTweets[date] += " " + newTweet
        else:
            daysTweets[date] = newTweet

    # End of: "for i in tweets:"
    # Thanks to Python for not letting me use curly braces.

    # --------------------------------------------------------------------------
    # Process the collected tweets
    print "tweet days: %d" % len(daysTweets)
    
    docLength = {}
    
    for day in daysTweets.keys():
        docTermFreq[day] = {} # Prepare the dictionary for that document.
        
        # Calculate Term Frequencies for this day/document.
        # Skip 1 letter words.
        for w in daysTweets[day].split(' '):
            if len(w) > 1:
                try:
                    docLength[day] += 1
                except KeyError:
                    docLength[day] = 1
                
                try:
                    docTermFreq[day][w] += 1
                except KeyError:
                    docTermFreq[day][w] = 1

        # Contribute to the document frequencies.
        for w in docTermFreq[day]:
            try:
                docFreq[w] += 1
            except KeyError:
                docFreq[w] = 1

    # --------------------------------------------------------------------------
    # Dump how many unique terms were identified by space splitting.
    # Dump how many days of tweets we collected.
    # For each day of tweets, dump how many unique terms were identified by space splitting.
    #
    print "sizeof documents: %s" % docLength
    print "sizeof docFreq: %d" % len(docFreq)         # this is how many unique terms
    print "sizeof docTermFreq: %d" % len(docTermFreq) # this is how many days

    for day in docTermFreq:
        print "sizeof docTermFreq[%s]: %d" % (str(day), len(docTermFreq[day])) # this is how many unique terms were in that day
        #print docTermFreq[day]

    # --------------------------------------------------------------------------
    # Remove singletons -- standard practice.
    # Skipped with tweets for now...

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(docTermFreq), docFreq)
    
    # Calculate the tf-idf values.
    docTfIdf = vectorspace.calculate_tfidf(docLength, docTermFreq, invdocFreq)

    # Recap of everything we have stored.
    # docLength      is the dictionary of term counts per document
    # daysTweets     is the dictionary of the tweets by date as integer
    # docFreq        is the dictionary of in how many documents the "word" appears
    # invdocFreq     is the dictionary of the inverse document frequencies
    # docTermFreq    is the dictionary of term frequencies by date as integer
    # docTfIdf       is similar to docTermFreq, but holds the tf-idf values

    # Sort the lists by decreasing value and dump the information.
    # TODO: Upgrade this to print the top 15-20 or so.
    sorted_keys = sorted(docTfIdf.keys())

    print "token:weight"
    for day in sorted_keys:
        print str(day) + ":---"
        sorted_tokens = sorted(
                               docTfIdf[day].items(),
                               key=operator.itemgetter(1), # (1) is value
                               reverse=True)
        for k, v in sorted_tokens:
            print k + ":" + str(v)

    # Dump the matrix.
    with open(sys.argv[3], "w") as f:
        f.write(vectorspace.dumpMatrix(docFreq, docTfIdf) + "\n")

    # Compute cosine similarities between sequential days.
    sorted_days = sorted(docTfIdf.keys())
    with open(sys.argv[4], "w") as f:
        # -1 because each goes +1
        for i in xrange(0, len(sorted_days) - 1):
            f.write("similarity(%s, %s) = " % (str(sorted_days[i]), str(sorted_days[i + 1])))
            f.write(str(vectorspace.cosineCompute(docTfIdf[sorted_days[i]], docTfIdf[sorted_days[i + 1]])) + "\n")
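
The TODO above asks for only the top 15-20 tokens per day; since `sorted_tokens` is already ordered by decreasing weight, slicing it is enough:

        for k, v in sorted_tokens[:20]:  # keep only the 20 highest-weighted tokens
            print k + ":" + str(v)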
Example #14
def main():

    cleanTweets = {}  # dictionary of the tweets by id as integer

    docFreq = {}  # dictionary of in how many documents the "word" appears
    invdocFreq = {}  # dictionary of the inverse document frequencies
    docTermFreq = {}  # dictionary of term frequencies by date as integer
    docTfIdf = {}  # similar to docTermFreq, but holds the tf-idf values

    # Did they provide the correct args?
    if len(sys.argv) != 3:
        usage()
        sys.exit(-1)

    # Pull lines
    with codecs.open(sys.argv[1], "r", 'utf-8') as f:
        tweets = f.readlines()

    # Pull stop words
    with open(sys.argv[2], "r") as f:
        stopwords = f.readlines()

    # clean them up!
    for i in xrange(0, len(stopwords)):
        stopwords[i] = stopwords[i].strip()

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        # Each tweet has <id>DATE-TIME</id> and <text>DATA</text>.
        #
        # So we'll have a dictionary<string, string> = {"id", "contents"}
        #
        # So, we'll just append to the end of the string for the dictionary
        # entry.
        info = tweetclean.extract_id(i)
        if info == None:
            sys.stderr.write("Invalid tweet hit\n")
            sys.exit(-1)

        # Add this tweet to the collection of clean ones.
        cleanTweets[info[0]] = tweetclean.cleanup(info[1], True, True)

    docLength = {}

    # --------------------------------------------------------------------------
    # Process the collected tweets
    for id in cleanTweets.keys():
        # Calculate Term Frequencies for this id/document.
        # Skip 1 letter words.
        # let's make a short list of the words we'll accept.
        pruned = [
            w for w in cleanTweets[id].split(' ')
            if len(w) > 1 and w not in stopwords
        ]

        # skip documents that only have one word.
        if len(pruned) < 2:
            continue

        docTermFreq[id] = {}  # Prepare the dictionary for that document.

        for w in pruned:
            try:
                docLength[id] += 1
            except KeyError:
                docLength[id] = 1

            try:
                docTermFreq[id][w] += 1
            except KeyError:
                docTermFreq[id][w] = 1

        # Contribute to the document frequencies.
        for w in docTermFreq[id]:
            try:
                docFreq[w] += 1
            except KeyError:
                docFreq[w] = 1

    # --------------------------------------------------------------------------
    # Dump how many unique terms were identified by space splitting.
    print "Total Count of Terms: %s" % docLength
    print "Unique Terms: %d" % len(docFreq)
    print "How many Documents: %d" % len(docTermFreq)

    # --------------------------------------------------------------------------
    # Remove singletons -- standard practice.
    # Skipped with tweets for now...

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(docTermFreq), docFreq)

    # Calculate the tf-idf values.
    docTfIdf = vectorspace.calculate_tfidf(docLength, docTermFreq, invdocFreq)

    # --------------------------------------------------------------------------
    # Recap of everything we have stored.
    # docLength      is the dictionary of term counts per document
    # cleanTweets    is the dictionary of the tweets by id as string
    # docFreq        is the dictionary of in how many documents the "word" appears
    # invdocFreq     is the dictionary of the inverse document frequencies
    # docTermFreq    is the dictionary of term frequencies by date as integer
    # docTfIdf       is similar to docTermFreq, but holds the tf-idf values

    # --------------------------------------------------------------------------
    # Build Centroid List
    centroids = []

    for doc, vec in docTfIdf.iteritems():
        centroids.append(centroid.Centroid(str(doc), vec))

    similarities = centroid.get_sims(centroids)
    average_sim = centroid.find_avg(centroids, True, similarities)
    stddev_sim = centroid.find_std(centroids, True, similarities)

    print "mean: %.10f\tstd: %.10f" % (average_sim, stddev_sim)

    # --------------------------------------------------------------------------
    # Merge centroids by highest similarity of at least threshold
    threshold = (average_sim + stddev_sim)

    while len(centroids) > 1:
        i, j, sim = centroid.find_max(centroids)

        # @warning: This is fairly crap.
        if sim >= threshold:
            centroids[i].add_centroid(centroids[j])
            del centroids[j]
            print "merged with sim: %.10f" % sim
        else:
            break

    print "len(centroids): %d" % len(centroids)
    print "avg(centroids): %.10f" % average_sim
    print "std(centroids): %.10f" % stddev_sim

    for cen in centroids:
        print centroid.topTerms(cen, 10)
Example #15
def main():
    # Weirdly in Python, you have free access to globals from within main().

    hourlyInterval = 0  # are we building hourly or daily histograms?
    docLength = 0  # total count of all terms
    daysTweets = {}  # dictionary of the tweets by date as integer,
                     # or by date-hour as integer when building hourly histograms

    docFreq = {}  # dictionary of in how many documents the "word" appears
    invdocFreq = {}  # dictionary of the inverse document frequencies
    docTermFreq = {}  # dictionary of term frequencies by date as integer
    docTfIdf = {}  # similar to docTermFreq, but holds the tf-idf values

    # Did they provide the correct args?
    if len(sys.argv) != 5:
        usage()
        sys.exit(-1)

    # Parse command line
    if sys.argv[1] == "hourly":
        hourlyInterval = 1
    elif sys.argv[1] == "daily":
        pass
    else:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[2], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        # Each tweet has <created>DATE-TIME</created> and <text>DATA</text>.
        #
        # So we'll have a dictionary<string, string> = {"date", "contents"}
        #
        # So, we'll just append to the end of the string for the dictionary
        # entry.
        info = tweetclean.extract(i)
        if info == None:
            sys.exit(-1)

        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        if hourlyInterval:
            date = tweetdate.buildDateInt(info[0])
        else:
            date = tweetdate.buildDateDayInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        # Add this tweet to the collective tweet for the day.
        if date in daysTweets:
            daysTweets[date] += " " + newTweet
        else:
            daysTweets[date] = newTweet

    # End of: "for i in tweets:"
    # Thanks to Python for not letting me use curly braces.

    # --------------------------------------------------------------------------
    # Process the collected tweets
    print "tweet days: %d" % len(daysTweets)

    docLength = {}

    for day in daysTweets.keys():
        docTermFreq[day] = {}  # Prepare the dictionary for that document.

        # Calculate Term Frequencies for this day/document.
        # Skip 1 letter words.
        for w in daysTweets[day].split(' '):
            if len(w) > 1:
                try:
                    docLength[day] += 1
                except KeyError:
                    docLength[day] = 1

                try:
                    docTermFreq[day][w] += 1
                except KeyError:
                    docTermFreq[day][w] = 1

        # Contribute to the document frequencies.
        for w in docTermFreq[day]:
            try:
                docFreq[w] += 1
            except KeyError:
                docFreq[w] = 1

    # --------------------------------------------------------------------------
    # Dump how many unique terms were identified by space splitting.
    # Dump how many days of tweets we collected.
    # For each day of tweets, dump how many unique terms were identified by space splitting.
    #
    print "sizeof documents: %s" % docLength
    print "sizeof docFreq: %d" % len(docFreq)  # this is how many unique terms
    print "sizeof docTermFreq: %d" % len(docTermFreq)  # this is how many days

    for day in docTermFreq:
        print "sizeof docTermFreq[%s]: %d" % (
            str(day), len(docTermFreq[day])
        )  # this is how many unique terms were in that day
        #print docTermFreq[day]

    # --------------------------------------------------------------------------
    # Remove singletons -- standard practice.
    # Skipped with tweets for now...

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(docTermFreq), docFreq)

    # Calculate the tf-idf values.
    docTfIdf = vectorspace.calculate_tfidf(docLength, docTermFreq, invdocFreq)

    # Recap of everything we have stored.
    # docLength      is the dictionary of term counts per document
    # daysTweets     is the dictionary of the tweets by date as integer
    # docFreq        is the dictionary of in how many documents the "word" appears
    # invdocFreq     is the dictionary of the inverse document frequencies
    # docTermFreq    is the dictionary of term frequencies by date as integer
    # docTfIdf       is similar to docTermFreq, but holds the tf-idf values

    # Sort the lists by decreasing value and dump the information.
    # TODO: Upgrade this to print the top 15-20 or so.
    sorted_keys = sorted(docTfIdf.keys())

    print "token:weight"
    for day in sorted_keys:
        print str(day) + ":---"
        sorted_tokens = sorted(
            docTfIdf[day].items(),
            key=operator.itemgetter(1),  # (1) is value
            reverse=True)
        for k, v in sorted_tokens:
            print k + ":" + str(v)

    # Dump the matrix.
    with open(sys.argv[3], "w") as f:
        f.write(vectorspace.dumpMatrix(docFreq, docTfIdf) + "\n")

    # Compute cosine similarities between sequential days.
    sorted_days = sorted(docTfIdf.keys())
    with open(sys.argv[4], "w") as f:
        # -1 because each goes +1
        for i in xrange(0, len(sorted_days) - 1):
            f.write("similarity(%s, %s) = " %
                    (str(sorted_days[i]), str(sorted_days[i + 1])))
            f.write(
                str(
                    vectorspace.cosineCompute(docTfIdf[
                        sorted_days[i]], docTfIdf[sorted_days[i + 1]])) + "\n")
Example #16
def main():

    # Did they provide the correct args?
    if len(sys.argv) != 5:
        usage()
        sys.exit(-1)

    database_file = sys.argv[1]
    minimum = int(sys.argv[2])
    stop_file = sys.argv[3]
    output_folder = sys.argv[4]

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.importStopWords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_collect = "select owner from tweets group by owner having count(*) >= %d;"
    query_tweets = "select id, contents as text from tweets where owner = %d;"

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    users = []

    for row in c.execute(query_collect % (minimum)):
        users.append(row['owner'])


    # --------------------------------------------------------------------------
    # Process those tweets by user set.
    for u in users:

        users_tweets = {}
        docTermFreq = {}   # dictionary of term frequencies by date as integer
        vocab = []         # array of terms

        for row in c.execute(query_tweets % u):
            users_tweets[row['id']] = row['text']

        # ----------------------------------------------------------------------
        # Process tweets
        for id in users_tweets:

            if users_tweets[id] == None: # this happens, lol.
                continue

            users_tweets[id] = tweetclean.cleanup(users_tweets[id], True, True)
        
            # Calculate Term Frequencies for this id/document.
            # Skip 1 letter words.

            # let's make a short list of the words we'll accept.
            pruned = [w for w in users_tweets[id].split(' ') \
                      if len(w) > 1 and w not in stopwords]

            # skip documents that only have one word.
            if len(pruned) < 2:
                continue

            docTermFreq[id] = {} # Prepare the dictionary for that document.

            for w in pruned:
                try:
                    docTermFreq[id][w] += 1
                except KeyError:
                    docTermFreq[id][w] = 1

                # slow: linear search... maybe switch to a sorted method?
                if w not in vocab:
                    vocab.append(w)

        vocab.sort()

        # ----------------------------------------------------------------------
        # Build the vocab.txt file
        with open(os.path.join(output_folder, "%d.vocab" % u), 'w') as f:
            f.write("\n".join(vocab))
    
        # ----------------------------------------------------------------------
        # Given the vocab array, build the document term index + counts:
        sorted_tweets = sorted(docTermFreq.keys())
        data = ""
    
        for id in sorted_tweets:
            print "%d" % id
            data += "%d " % len(docTermFreq[id])
        
            for term in docTermFreq[id]:
                indx = getIndx(vocab, term)
                if indx == -1:
                    sys.exit(-1)
                data += "%d:%d " % (indx, docTermFreq[id][term])
            
            data += "\n"

        with open(os.path.join(output_folder, "%d.dat" % u), "w") as f:
            f.write(data)
    
    # end for each user.

    # --------------------------------------------------------------------------
    # Done.
    conn.close()
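
Each line written to the per-user `.dat` file above has the shape `N idx:count idx:count ...` (number of distinct terms, then vocab-index/count pairs), which resembles the LDA-C document format. A small illustrative sketch for reading such a line back:

def parse_dat_line(line):
    """Parse 'N idx:count idx:count ...' back into {vocab_index: count}."""
    fields = line.split()
    return dict((int(i), int(c)) for i, c in (pair.split(":") for pair in fields[1:]))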
Example #17
def main():

    docLength = 0 # total count of all terms
    daysTweets = {}    # dictionary of the tweets by date as integer
    invdocFreq = {}    # dictionary of the inverse document frequencies
    docFreq = {}       # dictionary of document frequencies
    daysHisto = {}     # dictionary of the n-grams by date as integer

    # Did they provide the correct args?
    if len(sys.argv) != 2:
        usage()
        sys.exit(-1)

    # Pull lines
    with open(sys.argv[1], "r") as f:
        tweets = f.readlines()

    print "tweets: %d" % len(tweets)

    # --------------------------------------------------------------------------
    # Process tweets
    for i in tweets:
        info = tweetclean.extract(i)
        if info == None:
            sys.exit(-1)
        
        # Build day string
        # This needs to return -1 on error, so I'll need to test it.
        date = tweetdate.buildDateInt(info[0])

        # Do some cleanup
        newTweet = tweetclean.cleanup(info[1])

        # Add this tweet to the collective tweet for the day.
        if date in daysTweets:
            daysTweets[date] += " " + newTweet
        else:
            daysTweets[date] = newTweet

    # End of: "for i in tweets:"
    # Thanks to python and not letting me use curly braces.

    # --------------------------------------------------------------------------
    # Process the collected tweets
    print "tweet days: %d" % len(daysTweets)
    gramSize = 3
    docLength = {}

    for day in sorted(daysTweets.keys()):
        daysHisto[day] = {} # initialize the sub-dictionary

        # This gives me values, starting at 0, growing by gramSize for length of the tweet.
        # range(0, len(daysTweets[day]), gramSize)
        # This should give you values, starting at 0 for length of the tweet.
        # range(0, len(daysTweets[day]), 1)
        #
        for j in range(0, len(daysTweets[day]), gramSize):
            # this doesn't seem to do the sliding window I was expecting but rather just chunks it.
            w = daysTweets[day][j:j + gramSize]
            
            # wu is a special format that will not screw with whitespace
            wu = "_%s_" % w
            try:
                docLength[day] += 1
            except KeyError:
                docLength[day] = 1

            try:
                daysHisto[day][wu] += 1
            except KeyError:
                daysHisto[day][wu] = 1

            try:
                docFreq[wu] += 1
            except KeyError:
                docFreq[wu] = 1

    # Calculate the inverse document frequencies.
    invdocFreq = vectorspace.calculate_invdf(len(daysHisto), docFreq)

    # Calculate the tf-idf values.
    daysHisto = vectorspace.calculate_tfidf(docLength, daysHisto, invdocFreq)

    # Dump the matrix.
    #print vectorspace.dumpMatrix(docFreq, daysHisto) + "\n"

    # Compute cosine similarities between sequential days.
    sorted_days = sorted(daysHisto.keys())
    for i in range(0, len(sorted_days) - 1):
        print "similarity(%s, %s) = " % (str(sorted_days[i]), str(sorted_days[i + 1])),
        print vectorspace.cosineCompute(daysHisto[sorted_days[i]], daysHisto[sorted_days[i + 1]])
Example #18
def thread_main(database_file, output_folder, users, stopwords, start, cnt):
    """
    Process the users in your range!
    
    Each thread gets its own hook into the database, so they don't interfere.
    
    I could use the whole Queue thing... but I don't feel like trying to get 
    that to work as well.
    """

    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    # --------------------------------------------------------------------------
    # Process this thread's users.
    for j in xrange(start, start + cnt):
        user_id = users[j]
        print "processing: %d" % user_id
        for row in c.execute(query_tweets % user_id):
            if row['text'] is not None:
                users_tweets[row['id']] = \
                    tweetclean.cleanup(row['text'], True, True)

        # only words that are greater than one letter and not in the stopword 
        # list.
        texts = [[word for word in users_tweets[uid].split() \
                  if word not in stopwords and len(word) > 1] \
                    for uid in users_tweets]

        # ----------------------------------------------------------------------
        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) \
                          if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once] \
                    for text in texts]

        dictionary = corpora.Dictionary(texts)
        # store the dictionary, for future reference
        #dictionary.save(os.path.join("lda_out", '%d.dict' % user_id))

        corpus = [dictionary.doc2bow(text) for text in texts]
        # store to disk, for later use
        #corpora.MmCorpus.serialize(
        #                           os.path.join(
        #                                        output_folder,
        #                                        '%d.mm' % user_id),
        #                           corpus)

        # ----------------------------------------------------------------------
        # is this different...
        #corpus = \
        #    corpora.MmCorpus(os.path.join(output_folder, '%d.mm' % user_id))
    
        lda = models.ldamodel.LdaModel(
                                       corpus,
                                       id2word=dictionary,
                                       chunksize=100,
                                       passes=20,
                                       num_topics=100)
        #lda.save('%d.lda' % user_id)

        # ----------------------------------------------------------------------
        topic_strings = lda.show_topics(topics=-1, formatted=True)
        # shit, they share an output_file, so they could interrupt each other.
        ### so switch to individual files...
        ###
        with open(os.path.join(output_folder, "%d.topics" % user_id), "w") as f:
            f.write("user: %d\n#topics: %d\n" % (user_id, len(topic_strings)))
            for topic in topic_strings: # could use .join
                f.write("%s\n" % str(topic))

    conn.close()
Example #19
def main():

    # Did they provide the correct args?
    if len(sys.argv) != 4:
        usage()
        sys.exit(-1)

    database_file = sys.argv[1]
    user_id = int(sys.argv[2])
    stop_file = sys.argv[3]

    # --------------------------------------------------------------------------
    # Pull stop words
    stopwords = tweetclean.import_stopwords(stop_file)

    # --------------------------------------------------------------------------
    # Read in the database
    query_tweets = "select id, contents as text from tweets where owner = %d;"
    users_tweets = {}

    conn = sqlite3.connect(database_file)
    conn.row_factory = sqlite3.Row

    c = conn.cursor()

    for row in c.execute(query_tweets % user_id):
        if row['text'] is not None:
            users_tweets[row['id']] = \
                tweetclean.cleanup(row['text'], True, True)

    conn.close()

    # only words that are greater than one letter and not in the stopword list.
    texts = [[word for word in users_tweets[uid].split() \
              if word not in stopwords and len(word) > 1] \
                for uid in users_tweets]

    # remove words that appear only once
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) \
                      if all_tokens.count(word) == 1)
    texts = [[word for word in text \
              if word not in tokens_once] for text in texts]

    dictionary = corpora.Dictionary(texts)
    # store the dictionary, for future reference
    dictionary.save('%d.dict' % user_id)

    corpus = [dictionary.doc2bow(text) for text in texts]
    # store to disk, for later use
    corpora.MmCorpus.serialize('%d.mm' % user_id, corpus)

    # is this different...
    corpus = corpora.MmCorpus('%d.mm' % user_id)

    model = models.ldamodel.LdaModel(corpus,
                                     id2word=dictionary,
                                     chunksize=100,
                                     passes=20,
                                     num_topics=100)
    model.save('%d.lda' % user_id)

    lda = models.ldamodel.LdaModel.load('%d.lda' % user_id)

    #lda.show_topics(topics=1, topn=1, log=False, formatted=True)
    # Unlike what the documentation might have you believe, you have to pull it
    # back as a string if you want to use it.
    topic_strings = lda.show_topics(topics=-1, formatted=True)
    print "#topics: %d" % len(topic_strings)
    for topic in topic_strings:
        print topic