Example #1
def indexDocument(inString, schemeDocs, schemeQuery, invIndex):
    # check scheme
    if schemeDocs == 'tfidf' or schemeDocs == 'tfc':
        # Preprocess input string into list of tokens
        tokenList = preprocess.stemWords((preprocess.removeStopwords(
            (preprocess.tokenizeText(preprocess.removeSGML(inString))))))

        # get document number and increment doc-count
        docNum = invIndex['doc-count']
        invIndex['doc-count'] += 1

        # build temporary dictionary of term frequencies for this document
        # wordDict { 'word': tf }
        wordDict = {}
        for word in tokenList:
            if word in wordDict:
                wordDict[word] += 1.0
            else:
                wordDict[word] = 1.0

        # add entries to invIndex for each word and increment the
        # document frequency where necessary
        # (df and docList are assumed to be key constants defined at module level)
        for word, tf in wordDict.iteritems():
            if word in invIndex:
                invIndex[word][docList].append([docNum, tf])
                invIndex[word][df] += 1.0
            else:
                invIndex[word] = {df: 1.0, docList: [[docNum, tf]]}
        return invIndex
    else:
        sys.exit("Document weighting scheme '" + schemeDocs +
                 "' is not acceptable input. Try 'tfidf' or 'tcf'.")
Example #2
def processText(text, stopwords=False, stem=False):
	tokens = removeSGML(text)
	tokens = tokenizeText(tokens)
	if stopwords:
		tokens = removeStopwords(tokens)
	if stem:
		tokens = stemWords(tokens)
	return tokens
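A short usage sketch, assuming removeSGML, tokenizeText, removeStopwords, and stemWords are in scope (for example, imported from the project's preprocess module):

text = "<DOC>The cats are sleeping in the garden</DOC>"
tokens = processText(text)                               # strip SGML and tokenize only
stemmed = processText(text, stopwords=True, stem=True)   # full pipeline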
Example #3
def retrieveDocuments(query, invertedIndex, weightingDoc, weightingQuery):
    # preprocess the query
    que = preprocess.removeSGML(query)
    que = preprocess.tokenizeText(que)
    que = preprocess.removeStopwords(que)
    que = preprocess.stemWords(que)
    del que[0]  # drop the leading token (presumably the query ID)
    # find the set of documents that contain at least one query token
    tfque = {}
    docSet = set()
    for token in que:
        if token not in invertedIndex:
            continue
        if token not in tfque:
            tfque[token] = 0
        tfque[token] += 1
        for pair in invertedIndex[token]:
            docSet.add(pair[0])
    queList = tfque.keys()
    relDoc = {}
    # tfidf.tfidf
    if weightingDoc == "tfidf" and weightingQuery == "tfidf":
        docWeight, queWeight = cal_tfidf(queList, tfque, docSet, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    #tfidf.bpx
    elif weightingDoc == "tfidf" and weightingQuery == "bpx":
        docWeight, queWeight_f = cal_tfidf(queList, tfque, docSet,
                                           invertedIndex)
        queWeight = cal_bpx(queList, tfque, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    #nxx.tfidf
    elif weightingDoc == "nxx" and weightingQuery == "tfidf":
        docWeight_f, queWeight = cal_tfidf(queList, tfque, docSet,
                                           invertedIndex)
        docWeight = cal_nxx(queList, docSet, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    #nxx.bpx
    elif weightingDoc == "nxx" and weightingQuery == "bpx":
        docWeight = cal_nxx(queList, docSet, invertedIndex)
        queWeight = cal_bpx(queList, tfque, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    else:
        print "Weighting scheme for doc is [tfidf, nxx], for query is [tfidf, bpx]"
        quit()
    return relDoc
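The returned relDoc maps document IDs to similarity scores; the helpers cal_tfidf, cal_bpx, and cal_nxx come from the same project and are not shown here. A small sketch of how a caller might rank the results, assuming invertedIndex was built beforehand:

relDoc = retrieveDocuments("1 information retrieval system", invertedIndex,
                           "tfidf", "tfidf")
# sort document IDs by descending similarity score
ranked = sorted(relDoc.items(), key=lambda pair: pair[1], reverse=True)
print(ranked[:10])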
Example #4
def getAllTokens(senator_tweet_text):
    tokens = preprocess.tokenizeText(senator_tweet_text)
    tokens = preprocess.removeStopwords(tokens)
    # We decided to remove all 1-character words because they do not carry meaning
    tokens = [t for t in tokens if len(t) > 1]
    # account for common internet slang ('w/' -> 'with')
    tokens = ['with' if t == 'w/' else t for t in tokens]
    return tokens
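A quick usage sketch, assuming the project's preprocess module is on the path and its stopword list has been initialized:

tweet = "Meeting w/ constituents to discuss the farm bill"
print(getAllTokens(tweet))
# prints the cleaned tokens; exact output depends on the stopword list and tokenizer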
Example #5
def alternative_method():

    import preprocess as pr
    pr.initStopWords('stopwords')

    pr.stemWords(
        pr.removeStopwords(pr.tokenizeText(
            open('held_out_tweets.txt').read())))

    dictionary = extract_dictionary('tweets.txt')
    X = extract_feature_vectors('tweets.txt', dictionary)
    y = read_vector_file('labels.txt')
Example #6
def indexDocument(document, weightingDoc, weightingQuery, invertedIndex):
    # preprocess the content provided as input
    texts = preprocess.removeSGML(document)
    texts = preprocess.tokenizeText(texts)
    texts = preprocess.removeStopwords(texts)
    texts = preprocess.stemWords(texts)
    # add the tokens to the inverted index provided as input and compute
    # the counts needed for the given weighting schemes
    docID = texts[0]  # the first token is taken as the document ID
    tf = {}
    for word in texts:
        if word not in tf:
            tf[word] = 0
        tf[word] += 1
    for word in tf.keys():
        if word not in invertedIndex:
            invertedIndex[word] = []
        invertedIndex[word].append((docID, tf[word]))
Example #7
def createSentenceObjects(sentences):
    sentenceObjects = []
    # read the stopword list from a file in the working directory
    with open("stopwords", "r") as s:
        stopwords = s.read().split()
    # iterate through the list of sentences
    for index, sentence in enumerate(sentences):
        # Tokenize sentence
        tokens = tokenizeText(sentence)
        # Remove stopwords from sentence tokens
        tokens = removeStopwords(tokens, stopwords)
        # Stem the tokens of the sentence
        stemmed = stemWords(tokens)
        # Remove punctuations
        stemmed = removePunks(stemmed)
        # Create ourSentence object and append to list of ourSentence objects
        sentenceObjects.append(ourSentence(sentence, stemmed, index))
    # Return the list of ourSentence objects
    return sentenceObjects
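A usage sketch, assuming a 'stopwords' file exists in the working directory and that ourSentence, tokenizeText, removeStopwords, stemWords, and removePunks are defined elsewhere in the project:

sentences = ["The quick brown fox jumps over the lazy dog.",
             "Foxes are surprisingly quick animals."]
sentenceObjects = createSentenceObjects(sentences)
print(len(sentenceObjects))  # one ourSentence object per input sentence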
Example #8
def indexDocument(tweet, celeb, invertedIndex, docLengths):
    tokens = pro.tokenizeText(tweet)
    noStops = pro.removeStopwords(tokens)
    #stems = pro.stemWords(tokens)

    if celeb not in docLengths:
        docLengths[celeb] = 0

    for term in noStops:
        docLengths[celeb] += 1
        if term not in invertedIndex:
            invertedIndex[term] = []
            invertedIndex[term].append(1)
            invertedIndex[term].append({})
            invertedIndex[term][1][celeb] = 1
        elif celeb not in invertedIndex[term][1]:
            invertedIndex[term][0] += 1
            invertedIndex[term][1][celeb] = 1
        elif celeb in invertedIndex[term][1]:
            invertedIndex[term][1][celeb] += 1

    return invertedIndex, docLengths
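A small sketch of how the index might be built over a stream of tweets, assuming pro is the project's preprocess module and using made-up handles:

invertedIndex = {}
docLengths = {}
tweets = [("Excited to announce my new album", "celebA"),
          ("New album drops on Friday", "celebB")]
for text, celeb in tweets:
    invertedIndex, docLengths = indexDocument(text, celeb, invertedIndex, docLengths)

# each entry is [number of celebs whose tweets contain the term, {celeb: term count}]
print(invertedIndex.get('album'))
print(docLengths)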
Example #9
def retrieveDocuments(query, invIndex, schemeDocs, schemeQuery):
    # Preprocess query into list of tokens
    tokenList = preprocess.stemWords((preprocess.removeStopwords(
        (preprocess.tokenizeText(preprocess.removeSGML(query))))))

    # get query term frequencies
    queryTermFreq = {}
    for word in tokenList:
        # only include words that appear in at least one document
        if word in invIndex:
            if word in queryTermFreq:
                queryTermFreq[word] += 1.0
            else:
                queryTermFreq[word] = 1.0

    # get query length (query term normalization)
    # (idf, docList, and docLengths are assumed to be constants/globals defined at module level)
    queryLength = 0.0
    for word in queryTermFreq:
        if word in invIndex:
            queryLength += math.pow(invIndex[word][idf] * queryTermFreq[word],
                                    2)
    queryLength = math.sqrt(queryLength)

    # first scheme set is tfidf.tfidf with no normalization
    if schemeQuery == 'tfidf' and schemeDocs == schemeQuery:
        # create similarity score dictionary -> maps relevant docs to similarity score
        # first step is to create the numerator (dot product), then divide all terms by denominator (normalization)
        # using tfc method for query and document
        simScores = {}
        # iterate over each word
        for word in queryTermFreq:
            # and each document that contains that word
            for docNum, tf in invIndex[word][docList]:
                if docNum in simScores:
                    simScores[docNum] += (queryTermFreq[word] * tf *
                                          math.pow(invIndex[word][idf], 2))
                else:
                    simScores[docNum] = (queryTermFreq[word] * tf *
                                         math.pow(invIndex[word][idf], 2))

        # divide each dot product by normalization factor -- APPARENTLY DO NOT DO THIS?!?!?
        # REMOVED --
        # for doc in simScores:
        # 	simScores[doc] = simScores[doc] / (queryLength * docLengths[doc])

        # return the simScore dictionary
        return simScores

        # create simScoresList
        # simScoresList = []
        # for docNum, score in simScores.iteritems():
        # 	simScoresList.append([docNum, score])
        # simScoresList.sort(key=lambda scores: scores[1], reverse=True)

    # second scheme is tfc.nfx
    elif schemeDocs == 'tfc' and schemeQuery == 'nfx':
        # get max term frequency in query
        queryMaxTF = 0
        for word, tf in queryTermFreq.iteritems():
            if tf > queryMaxTF:
                queryMaxTF = tf

        simScores = {}

        # iterate over each word in query and each doc that contains those words
        for word in queryTermFreq:
            for docNum, tf in invIndex[word][docList]:
                if docNum in simScores:
                    simScores[docNum] += (
                        tf * math.pow(invIndex[word][idf], 2) *
                        (0.5 + (0.5 * queryTermFreq[word] / queryMaxTF)))
                else:
                    simScores[docNum] = (
                        tf * math.pow(invIndex[word][idf], 2) *
                        (0.5 + (0.5 * queryTermFreq[word] / queryMaxTF)))

        # normalize using document length (tfc scheme for doc)
        for doc in simScores:
            simScores[doc] = simScores[doc] / docLengths[doc]

        return simScores
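A sketch of how a caller might rank the scores returned for the tfc.nfx scheme, assuming invIndex was built by the companion indexDocument and that idf, docList, and docLengths are defined at module level:

simScores = retrieveDocuments("hubble space telescope", invIndex, 'tfc', 'nfx')
# rank documents by similarity score, highest first
ranking = sorted(simScores.items(), key=lambda pair: pair[1], reverse=True)
print(ranking[:10])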
Example #10
import preprocess as pr
pr.initStopWords('stopwords')
pr.stemWords(
    pr.removeStopwords(pr.tokenizeText(open('held_out_tweets.txt').read())))
Example #11
def processDoc(doc_string):
    token_doc = preprocess.stemWords(
        preprocess.removeStopwords(preprocess.tokenizeText(doc_string)))
    return token_doc
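Usage is straightforward once the preprocess module is imported and its stopword list initialized:

doc = "Researchers are studying how galaxies form and evolve"
print(processDoc(doc))  # stemmed, stopword-free tokens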
Example #12
                json_data = open(directory + '/' + subreddit + '/' + my_file)
                commentThread = json.load(json_data)
                commentCount = 0
                readComments(commentThread[1]['data']['children'],
                             commentCount, subreddit.lower(), comment_corpus,
                             user_links, comment_counts)
            except ValueError:
                continue
        #print('comments: ' + str(comment_counts[subreddit]))

tokenized_comments = {}
print('Tokenizing text...')
for key, value in comment_corpus.items():
    print(key)
    tokenizedText = tokeText(value)
    removedStopWords = removeStopwords(tokenizedText)
    tokenized_comments[key] = removedStopWords

json_out = open('comments.json', 'w')
json.dump(tokenized_comments, json_out)
json_out.close()
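Once comments.json has been written, the tokenized comments can be reloaded in a later session; a minimal sketch:

import json

with open('comments.json') as json_in:
    tokenized_comments = json.load(json_in)
print(len(tokenized_comments))  # one entry per corpus key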

# directLinks = open("directLinks.txt", "w")
# indirectLinks = open("indirectLinks.txt", "w")
# for key, value in tokenized_comments.items():
#   for it in value:
#     try:
#       temp = re.search('(?<=\/r\/)[^\/]+', it)
#       subr = temp.group(0)
#       if subr != key and tokenized_comments.has_key(subr):
#         directLinks.write(key + ' ' + subr + '\n')