import sys

import preprocess


def indexDocument(inString, schemeDocs, schemeQuery, invIndex):
    # check scheme
    if schemeDocs == 'tfidf' or schemeDocs == 'tfc':
        # Preprocess input string into a list of tokens
        tokenList = preprocess.stemWords(preprocess.removeStopwords(
            preprocess.tokenizeText(preprocess.removeSGML(inString))))
        # get document number and increment doc-count
        docNum = invIndex['doc-count']
        invIndex['doc-count'] += 1
        # build temporary dictionary of term frequencies for this document
        # wordDict: {'word': tf}
        wordDict = {}
        for word in tokenList:
            if word in wordDict:
                wordDict[word] += 1.0
            else:
                wordDict[word] = 1.0
        # add entries to invIndex for each word,
        # incrementing document frequency where necessary
        for word, tf in wordDict.items():
            if word in invIndex:
                invIndex[word]['docList'].append([docNum, tf])
                invIndex[word]['df'] += 1.0
            else:
                invIndex[word] = {'df': 1.0, 'docList': [[docNum, tf]]}
        return invIndex
    else:
        sys.exit("Document weighting scheme '" + schemeDocs +
                 "' is not acceptable input. Try 'tfidf' or 'tfc'.")
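# Minimal, illustrative driver for the indexDocument above (not part of the
# original source): the inverted index is assumed to be seeded with a
# 'doc-count' counter, and the documents here are invented placeholders.
invIndex = {'doc-count': 1}
for doc in ["the cat sat on the mat", "the dog chased the cat"]:
    invIndex = indexDocument(doc, 'tfidf', 'tfidf', invIndex)
# each indexed term now maps to {'df': ..., 'docList': [[docNum, tf], ...]}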
# removeSGML, tokenizeText, removeStopwords, and stemWords are assumed to come
# from the local preprocess module
from preprocess import removeSGML, tokenizeText, removeStopwords, stemWords


def processText(text, stopwords=False, stem=False):
    tokens = removeSGML(text)
    tokens = tokenizeText(tokens)
    if stopwords:
        tokens = removeStopwords(tokens)
    if stem:
        tokens = stemWords(tokens)
    return tokens
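# Illustrative only: exercising the three flag combinations of processText on
# an invented input string.
raw = "<DOC> The Senate voted on the new budget resolution. </DOC>"
print(processText(raw))                             # raw tokens
print(processText(raw, stopwords=True))             # without stopwords
print(processText(raw, stopwords=True, stem=True))  # stemmed content tokens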
def retrieveDocuments(query, invertedIndex, weightingDoc, weightingQuery):
    # preprocess the query (the leading token is discarded)
    que = preprocess.removeSGML(query)
    que = preprocess.tokenizeText(que)
    que = preprocess.removeStopwords(que)
    que = preprocess.stemWords(que)
    del que[0]
    # collect the set of documents containing at least one query token
    tfque = {}
    docSet = set()
    for token in que:
        if token not in invertedIndex:
            continue
        if token not in tfque:
            tfque[token] = 0
        tfque[token] += 1
        for pair in invertedIndex[token]:
            docSet.add(pair[0])
    queList = list(tfque.keys())
    relDoc = {}
    # tfidf.tfidf
    if weightingDoc == "tfidf" and weightingQuery == "tfidf":
        docWeight, queWeight = cal_tfidf(queList, tfque, docSet, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    # tfidf.bpx
    elif weightingDoc == "tfidf" and weightingQuery == "bpx":
        docWeight, queWeight_f = cal_tfidf(queList, tfque, docSet, invertedIndex)
        queWeight = cal_bpx(queList, tfque, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    # nxx.tfidf
    elif weightingDoc == "nxx" and weightingQuery == "tfidf":
        docWeight_f, queWeight = cal_tfidf(queList, tfque, docSet, invertedIndex)
        docWeight = cal_nxx(queList, docSet, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    # nxx.bpx
    elif weightingDoc == "nxx" and weightingQuery == "bpx":
        docWeight = cal_nxx(queList, docSet, invertedIndex)
        queWeight = cal_bpx(queList, tfque, invertedIndex)
        for docID in docWeight.keys():
            relDoc[docID] = 0
            for idx, tf in enumerate(docWeight[docID]):
                relDoc[docID] += tf * queWeight[idx]
    else:
        print("Weighting scheme for doc is [tfidf, nxx], for query is [tfidf, bpx]")
        quit()
    return relDoc
def getAllTokens(senator_tweet_text):
    tokens = preprocess.tokenizeText(senator_tweet_text)
    tokens = preprocess.removeStopwords(tokens)
    # We decided to remove all 1-character words b/c they do not contain meaning
    tokens = [t for t in tokens if len(t) > 1]
    # account for common internet slang; rebuild the list, since reassigning
    # the loop variable would not modify it
    tokens = ['with' if t == 'w/' else t for t in tokens]
    return tokens
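# Illustrative call on an invented tweet; exact output depends on the
# preprocess module's tokenizer and stopword list.
print(getAllTokens("Meeting w/ constituents today about the farm bill"))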
def alternative_method():
    import preprocess as pr
    pr.initStopWords('stopwords')
    # preprocess the held-out tweets (the result is not stored here)
    pr.stemWords(pr.removeStopwords(pr.tokenizeText(
        open('held_out_tweets.txt').read())))
    # extract_dictionary, extract_feature_vectors, and read_vector_file are
    # defined elsewhere in this module
    dictionary = extract_dictionary('tweets.txt')
    X = extract_feature_vectors('tweets.txt', dictionary)
    y = read_vector_file('labels.txt')
def indexDocument(document, weightingDoc, weightingQuery, invertedIndex):
    # preprocess the content provided as input
    texts = preprocess.removeSGML(document)
    texts = preprocess.tokenizeText(texts)
    texts = preprocess.removeStopwords(texts)
    texts = preprocess.stemWords(texts)
    # add the tokens to the inverted index provided as input and compute the
    # counts needed for the given weighting schemes; the first token of the
    # document is taken as its ID
    docID = texts[0]
    tf = {}
    for word in texts:
        if word not in tf:
            tf[word] = 0
        tf[word] += 1
    for word in tf.keys():
        if word not in invertedIndex:
            invertedIndex[word] = []
        invertedIndex[word].append((docID, tf[word]))
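# Sketch (not from the original source): building an index of the shape this
# code's retrieveDocuments consumes, using invented documents whose first
# token is the document ID.
invertedIndex = {}
indexDocument("1 markets rallied after the announcement", "tfidf", "tfidf", invertedIndex)
indexDocument("2 the committee delayed the budget vote", "tfidf", "tfidf", invertedIndex)
# each stemmed term now maps to a list of (docID, term frequency) pairs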
def createSentenceObjects(sentences):
    sentenceObjects = []
    # load stopwords from the local "stopwords" file
    with open("stopwords", "r") as s:
        stopwords = s.read().split()
    # iterate through the list of sentences
    for index, sentence in enumerate(sentences):
        # Tokenize sentence
        tokens = tokenizeText(sentence)
        # Remove stopwords from sentence tokens
        tokens = removeStopwords(tokens, stopwords)
        # Stem the tokens of the sentence
        stemmed = stemWords(tokens)
        # Remove punctuation
        stemmed = removePunks(stemmed)
        # Create ourSentence object and append it to the list
        sentenceObjects.append(ourSentence(sentence, stemmed, index))
    # Return the list of ourSentence objects
    return sentenceObjects
def indexDocument(tweet, celeb, invertedIndex, docLengths):
    tokens = pro.tokenizeText(tweet)
    noStops = pro.removeStopwords(tokens)
    # stems = pro.stemWords(tokens)
    if celeb not in docLengths:
        docLengths[celeb] = 0
    # each index entry has the form [document frequency, {celeb: term frequency}]
    for term in noStops:
        docLengths[celeb] += 1
        if term not in invertedIndex:
            invertedIndex[term] = []
            invertedIndex[term].append(1)
            invertedIndex[term].append({})
            invertedIndex[term][1][celeb] = 1
        elif celeb not in invertedIndex[term][1]:
            invertedIndex[term][0] += 1
            invertedIndex[term][1][celeb] = 1
        elif celeb in invertedIndex[term][1]:
            invertedIndex[term][1][celeb] += 1
    return invertedIndex, docLengths
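# Illustrative call sequence with invented handles and tweets; pro is assumed
# to be this repo's preprocess module.
invertedIndex, docLengths = {}, {}
indexDocument("great game tonight", "athlete_a", invertedIndex, docLengths)
indexDocument("great new single out now", "musician_b", invertedIndex, docLengths)
# a term shared by both tweets ends up as [2, {'athlete_a': 1, 'musician_b': 1}]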
import math

import preprocess


def retrieveDocuments(query, invIndex, schemeDocs, schemeQuery):
    # Preprocess query into a list of tokens
    tokenList = preprocess.stemWords(preprocess.removeStopwords(
        preprocess.tokenizeText(preprocess.removeSGML(query))))
    # get query term frequencies
    queryTermFreq = {}
    for word in tokenList:
        # only include words that appear in at least one document
        if word in invIndex:
            if word in queryTermFreq:
                queryTermFreq[word] += 1.0
            else:
                queryTermFreq[word] = 1.0
    # get query length (query term normalization);
    # 'idf' is assumed to have been added to each invIndex entry after indexing
    queryLength = 0.0
    for word in queryTermFreq:
        if word in invIndex:
            queryLength += math.pow(invIndex[word]['idf'] * queryTermFreq[word], 2)
    queryLength = math.sqrt(queryLength)
    # first scheme set is tfidf.tfidf with no normalization
    if schemeQuery == 'tfidf' and schemeDocs == schemeQuery:
        # build a similarity-score dictionary mapping relevant docs to scores:
        # first the dot product (numerator); normalization would follow
        simScores = {}
        # iterate over each query word
        for word in queryTermFreq:
            # and each document that contains that word
            for docNum, tf in invIndex[word]['docList']:
                if docNum in simScores:
                    simScores[docNum] += (queryTermFreq[word] * tf *
                                          math.pow(invIndex[word]['idf'], 2))
                else:
                    simScores[docNum] = (queryTermFreq[word] * tf *
                                         math.pow(invIndex[word]['idf'], 2))
        # length normalization was intentionally left out for this scheme:
        # for doc in simScores:
        #     simScores[doc] = simScores[doc] / (queryLength * docLengths[doc])
        return simScores
        # alternative: build and sort a list of [docNum, score] pairs
        # simScoresList = []
        # for docNum, score in simScores.items():
        #     simScoresList.append([docNum, score])
        # simScoresList.sort(key=lambda scores: scores[1], reverse=True)
    # second scheme is tfc.nfx
    elif schemeDocs == 'tfc' and schemeQuery == 'nfx':
        # get max term frequency in query
        queryMaxTF = 0
        for word, tf in queryTermFreq.items():
            if tf > queryMaxTF:
                queryMaxTF = tf
        simScores = {}
        # iterate over each word in the query and each doc containing it
        for word in queryTermFreq:
            for docNum, tf in invIndex[word]['docList']:
                weight = (tf * math.pow(invIndex[word]['idf'], 2) *
                          (0.5 + (0.5 * queryTermFreq[word] / queryMaxTF)))
                if docNum in simScores:
                    simScores[docNum] += weight
                else:
                    simScores[docNum] = weight
        # normalize using document length (tfc scheme for docs);
        # docLengths is assumed to be a module-level {docNum: vector length} dict
        for doc in simScores:
            simScores[doc] = simScores[doc] / docLengths[doc]
        return simScores
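# Hypothetical ranking step for the scores returned above; invIndex is assumed
# to have been built by the matching indexDocument and enriched with 'idf'
# values per term.
scores = retrieveDocuments("budget vote", invIndex, 'tfidf', 'tfidf')
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
for docNum, score in ranked[:10]:
    print(docNum, score)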
def processDoc(doc_string):
    token_doc = preprocess.stemWords(preprocess.removeStopwords(
        preprocess.tokenizeText(doc_string)))
    return token_doc
# (fragment: the enclosing imports and the loop over each subreddit's JSON
# files are not shown)
try:
    json_data = open(directory + '/' + subreddit + '/' + my_file)
    commentThread = json.load(json_data)
    commentCount = 0
    readComments(commentThread[1]['data']['children'], commentCount,
                 subreddit.lower(), comment_corpus, user_links, comment_counts)
except ValueError:
    continue
# print('comments: ' + str(comment_counts[subreddit]))

tokenized_comments = {}
print('Tokenizing text...')
for key, value in comment_corpus.items():
    print(key)
    tokenizedText = tokeText(value)
    removedStopWords = removeStopwords(tokenizedText)
    # keep the stopword-filtered tokens for this subreddit
    tokenized_comments[key] = removedStopWords

json_out = open('comments.json', 'w')
json.dump(tokenized_comments, json_out)
json_out.close()

# directLinks = open("directLinks.txt", "w")
# indirectLinks = open("indirectLinks.txt", "w")
# for key, value in tokenized_comments.items():
#     for it in value:
#         try:
#             temp = re.search('(?<=\/r\/)[^\/]+', it)
#             subr = temp.group(0)
#             if subr != key and tokenized_comments.has_key(subr):
#                 directLinks.write(key + ' ' + subr + '\n')