Example #1
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stem the token, then strip any whitespace left around it
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        print(fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            newDict[k].append(fileT)
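
A hedged driver for this fragment (it relies on module-level stopwords, newDict, re, and a PorterStemmer exposing the classic three-argument stem(word, start, end) API):

import re
from collections import defaultdict
from porterStemmer import PorterStemmer  # local module, as imported in Example #7

stopwords = {"the", "a", "an", "is"}  # toy stopword set for illustration
newDict = defaultdict(list)

for token in "the running cats".split():
    printDocsHelper(token, k=1)

print(newDict)  # e.g. defaultdict(<class 'list'>, {1: ['run', 'cat']})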
Example #2
    def process(self):
        """
        This function reads the text file and performs-
            -punctuation
            -tokenization
            -lower-casing/upper-casing / punctuation / numbers
            -stop word
            -stemming
        """
        try:
            # read stopwords into a set so membership tests match whole
            # words rather than substrings of the raw file contents
            stopWords = set(open(self.stopwordFile, "r").read().split())
            try:
                if self.writerFlag:
                    outFile = open(self.oFile, "w")
                stemmer = PorterStemmer()
                dataDic = {}
                translator = str.maketrans('', '', string.punctuation)
                nTranslator = str.maketrans('', '', "0123456789")
                with open(self.iFile) as f:
                    for line in f:
                        try:
                            (key, val) = line.split("\t")
                        except ValueError:
                            continue
                        stringToWrite = ""
                        val = val.translate(translator)
                        val = val.translate(nTranslator)
                        val = val.lower().strip().split(" ")
                        if self.writerFlag:
                            stringToWrite = "%s %s \t" % (stringToWrite,
                                                          key.upper())

                        for words in val:
                            if words.strip() not in stopWords:
                                stringToWrite = "%s %s" % (stringToWrite,
                                                           stemmer.stem(words))

                        stringToWrite = "%s \n" % (stringToWrite)
                        if not self.writerFlag:
                            dataDic[key.strip()] = stringToWrite.strip()
                        else:
                            outFile.write(stringToWrite)
                if self.writerFlag:
                    outFile.close()
                else:
                    return dataDic
            except (OSError, IOError) as e:
                print("Wrong input file name or file path", e)
        except (OSError, IOError) as e:
            print("Wrong stopwords file name or file path", e)
Example #3
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            fileT = "./wordFiles/" + fileT
            initFreq = checkforFrequency(k, fileT)
            if checkifWritten(fileT, k):
                # fileT[12:] strips the "./wordFiles/" prefix (12 characters)
                with open(fileT, 'a') as FILE:
                    FILE.write(
                        str(fileT[12:]) + " " + str(k) + " " + str(initFreq))
                    FILE.write("\n")
                return 1
    return 0
Example #4
def tokenize(document, stem):
    tokens = []
    p = PorterStemmer()
    for text in document.headline, document.graphic, document.text:
        # Lowercase and split on non-alphanumerics
        text = text.lower()
        text_tokens = re.split(r'\W', text)
        if stem:
            stem_tokens = []
            for t in text_tokens:
                t = p.stem(t, 0, len(t) - 1)
                stem_tokens.append(t)
            text_tokens = stem_tokens
        tokens += text_tokens

    # Remove empty strings in resulting tokens list
    tokens = list(filter(None, tokens))
    return tokens
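
tokenize expects a document object exposing headline, graphic, and text attributes (the LATimes fields also used in Example #6). A hypothetical stand-in:

from collections import namedtuple

Document = namedtuple("Document", ["headline", "graphic", "text"])
doc = Document(headline="Rain Expected", graphic="",
               text="Heavy rain is expected today.")

print(tokenize(doc, stem=False))
# ['rain', 'expected', 'heavy', 'rain', 'is', 'expected', 'today']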
Example #5
def build_word_index(doc_dict):
    '''
    Builds the word-index dictionary plus a posting list of
    [term_id, doc_index, frequency] triples.
    '''
    dictionary = {}
    stopWords = set(stopwords.words('english'))
    p = PorterStemmer()
    global idx
    term_frequecy_list = []

    def append_to_word_list(text, doc_index):
        global idx
        text = " ".join(re.findall("[a-zA-Z]+", text))
        # de-duplicate the tokens and sort them for a stable order; note
        # that the set means each surface form is counted once per document
        text = sorted(set(text.split(" ")))
        f_dt = {}
        for word in text:
            if word != "" and word not in stopWords:
                word = p.stem(word, 0, len(word) - 1)
                # update frequency of the stemmed term
                if word not in f_dt:
                    f_dt[word] = 1
                else:
                    f_dt[word] += 1
                # assign a fresh id to unseen terms, then record the posting
                if word not in dictionary:
                    dictionary[word] = idx
                    idx += 1
                term_frequecy_list.append([dictionary[word], doc_index, f_dt[word]])

    idx = 1
    for i in range(1, len(doc_dict) + 1):
        if doc_dict[i][1] != '':
            append_to_word_list(doc_dict[i][1], i)

    return dictionary, term_frequecy_list
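
Judging from the loop at the bottom, doc_dict is assumed to be keyed by 1-based integers, with the document text in position 1 of each value. A toy call:

doc_dict = {
    1: ("DOC-1", "cats are running in the garden"),
    2: ("DOC-2", ""),  # documents with empty text are skipped
    3: ("DOC-3", "a cat ran"),
}
dictionary, postings = build_word_index(doc_dict)
print(dictionary)  # stemmed term -> term id
print(postings)    # [term_id, doc_index, within-document frequency] triples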
Example #6
File: bm25.py  Project: rfarmaha/MSCI_541
def calculate_bm25(topic_id, topic, token_token_id, postings_list, doc_id_no, average_doc_length, stem, docs_path):
    """Calculates BM25 for a topic against all LATimes Documents, returns ordered dictionary of doc_no to ranking"""
    query_tokens = tokenize(topic)
    doc_no_score = {}
    N = len(doc_id_no)

    p = PorterStemmer()

    # Calculate tf in query, and idf
    for token in query_tokens:
        qf = query_tokens.count(token)
        token_tf = ((K2 + 1)*qf) / (K2 + qf)

        # Calculate idf
        if stem:
            token = p.stem(token, 0, len(token) - 1)
        token_id = token_token_id[token]
        postings = postings_list[token_id]
        # Postings follow format: [doc_id, count]
        n_i = len(postings) // 2
        a = (N - n_i + 0.5) / (n_i + 0.5)
        token_idf = math.log(a)

        # Calculate tf for docs
        for i in range(0, len(postings), 2):
            doc_id = postings[i]
            doc_no = doc_id_no[doc_id]
            document = getDocument.retrieve_by_docno(docs_path, doc_no)

            fi = postings[i+1]
            K = K1 * ((1 - B) + B * (document.length / average_doc_length))
            doc_tf = ((K1 + 1)*fi) / (K + fi)
            score = doc_tf * token_tf * token_idf
            if doc_no in doc_no_score:
                doc_no_score[doc_no] = doc_no_score[doc_no] + score
            else:
                doc_no_score[doc_no] = score
    sorted_doc_no_score = OrderedDict(sorted(doc_no_score.items(), key=lambda t: t[1], reverse=True))

    print("Calculated scores for query: {}".format(topic_id))
    return sorted_doc_no_score
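
The function implements standard BM25 with module-level K1, K2, and B constants. As a sanity check, the same per-term arithmetic with made-up numbers:

import math

K1, K2, B = 1.2, 7.0, 0.75  # hypothetical values; the project defines its own
N, n_i = 1000, 50           # collection size, documents containing the term
qf, fi = 1, 3               # term frequency in the query and in the document
doc_len, avg_len = 120, 100

token_tf = ((K2 + 1) * qf) / (K2 + qf)
token_idf = math.log((N - n_i + 0.5) / (n_i + 0.5))
K = K1 * ((1 - B) + B * (doc_len / avg_len))
doc_tf = ((K1 + 1) * fi) / (K + fi)
print(doc_tf * token_tf * token_idf)  # this term's contribution to the score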
Example #7
import sys
import re
from functools import reduce  # reduce is not a builtin in Python 3
from porterStemmer import PorterStemmer
from collections import defaultdict
import copy

porter = PorterStemmer()


class QueryIndex:
    def __init__(self):
        self.index = {}
        self.titleIndex = {}
        self.tf = {}  #term frequencies
        self.idf = {}  #inverse document frequencies

    def intersectLists(self, lists):
        if len(lists) == 0:
            return []
        #start intersecting from the smaller list
        lists.sort(key=len)
        return list(reduce(lambda x, y: set(x) & set(y), lists))

    def getStopwords(self):
        # self.stopwordsFile is assigned elsewhere in the project
        with open(self.stopwordsFile, 'r') as f:
            stopwords = [line.rstrip() for line in f]
        self.sw = dict.fromkeys(stopwords)

    def getTerms(self, line):
        line = line.lower()
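
The excerpt is truncated after the lower-casing step. Judging from the parallel get_important_terms in Example #12, getTerms presumably continues by stripping non-alphanumerics, removing stopwords, and stemming; a hedged sketch of that continuation:

    def getTerms(self, line):
        line = line.lower()
        line = re.sub(r'[^a-z0-9 ]', ' ', line)  # keep alphanumerics only
        terms = [t for t in line.split() if t not in self.sw]
        return [porter.stem(t, 0, len(t) - 1) for t in terms]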
Example #8
    strg = ""
    for elements in temp:
        strg += " " + elements

    punc1 = [",","-","=","/","'",";","^","+","|",":","<",">","`","&","(",")"]
    punc2 = [".",'"',"[","]","?","!","*","%","{","}"]
    for punc in punc1:
       if punc in strg:
          strg = strg.replace(punc," ")
    for punc in punc2:
       if punc in strg:
          strg = strg.replace(punc,"")
    
    strg = strg.split()
    finallist = [x for x in strg if x not in stop]
    p = PorterStemmer()
    midlist = [(p.stem(word, 0, len(word)-1)) for word in finallist]
    newlist = [x for x in midlist if x not in stop]
    finalstring = ''.join(" " + x for x in newlist)
   
    queryhashmap[key] = finalstring.strip()

avgdoclen = 46.25
#avgdoclen = 46.2484394507 #zipfs avgdoclen

def calcOBM25(OBM25dict,docid,doclen,termfreq,df):
    b = 0.6  # typical range 0.2-1.0
    k = 1.6  # typical range 1.2-2.0
    idf = log(3204.0 / df)  # 3204: total number of documents in this collection
    numerator = termfreq * float(k+1.0)
    denominator = termfreq + k*(1.0 - b + (b*doclen)/avgdoclen)
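
The snippet ends before the score is used; presumably the term's BM25 contribution idf * numerator / denominator is accumulated per document, along these lines:

    score = idf * numerator / denominator  # hypothetical continuation
    OBM25dict[docid] = OBM25dict.get(docid, 0.0) + score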
Example #9
def processEmail(email_contents):
    """Preprocess the body of an email and return word_indices, a list of
    the vocabulary indices of the words contained in the email."""

    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents = re.compile('<[^<>]+>').sub(' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\\s]*').sub(
        ' httpaddr ', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\\s]+@[^\\s]+').sub(' emailaddr ',
                                                       email_contents)

    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)

    # Tokenize on punctuation and whitespace; '-' is kept at the end of the
    # character class so it is treated literally rather than as a range
    email_contents = re.split('[ @$/#.:&*+=\\[\\]?!(){},'
                              '\">_<;%\\n\\r-]', email_contents)
    email_contents = [word for word in email_contents if len(word) > 0]

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Process file
    stemmer = PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Remove any remaining non-alphanumeric characters and stem the word
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        # Skip the word if it is too short
        if len(word) < 1:
            continue
        processed_email.append(word)
        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable str. You should look up str in the
        #               vocabulary list (vocabList). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if str = 'action', then you should
        #               look up the vocabulary list to find where in vocabList
        #               'action' appears. For example, if vocabList{18} =
        #               'action', then, you should add 18 to the word_indices
        #               vector (e.g., word_indices = [word_indices ; 18]; ).
        #
        # Note: vocabList{idx} returns a the word with index idx in the
        #       vocabulary list.
        #
        # Note: You can use strcmp(str1, str2) to compare two strings (str1 and
        #       str2). It will return 1 only if the two strings are equivalent.
        #
        try:
            index = vocabList.index(word)
        except ValueError:
            pass
        else:
            word_indices.append(index)
        # ============================================================
    print(' '.join(processed_email))
    # Print footer
    print('\n\n=========================')
    return word_indices
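
Unlike the other examples on this page, this port calls stemmer.stem() with a single argument and depends on getVocabList(). A hypothetical invocation:

with open('emailSample1.txt') as f:  # hypothetical sample file
    word_indices = processEmail(f.read())
print(word_indices)  # vocabulary indices of the recognized words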
Example #10
    def querySearcher(self):
        """This is the main function which performs the AND, OR, AND NOT, BUT NOT and OR NOT operations"""
        try:
            stemmer = PorterStemmer()
            preProcess = PreProcessing(False, self.iFile, "",
                                       self.stopwordFile)
            preProcessRes = preProcess.process()
            createIndex = InvertedIndexGenerator(False, preProcessRes, "")
            mainIndex = createIndex.generate()
            originalquery = self.query
            self.query = self.query.lower()
            # note: str.replace also rewrites 'but' inside longer words;
            # a word-boundary regex would be safer here
            self.query = self.query.replace('but', 'and')
            querySep = list(self.parenthetic_contents(self.query))
            res = self.queryCalculator(querySep, mainIndex, stemmer,
                                       preProcessRes)
            tempQuery = self.query
            tempQuery = tempQuery.replace('{', '')
            tempQuery = tempQuery.replace('}', '')
            tempQuery = tempQuery.replace('(', '')
            tempQuery = tempQuery.replace(')', '')
            tempQuery = tempQuery.replace('/', '')
            mapKey = {}

            quryStem = []
            for t in tempQuery.split(" "):
                quryStem.append(stemmer.stem(t))
            tempQuery = ' '.join(quryStem)

            for i, r in enumerate(res.keys()):
                mapKey["%d_%s" % (i, "firstItr")] = r
                tempQuery = tempQuery.replace(r, "%d_%s" % (i, "firstItr"))
            res = {**res, **mainIndex}
            andPro = tempQuery.split(" ")
            """AND operation"""
            for index, term in enumerate(andPro):
                if term == "and":
                    if andPro[index + 1] == "not":
                        continue
                    else:
                        if mapKey.get(andPro[index - 1], -1) == -1:
                            tempKeyFirst = andPro[index - 1]
                        else:
                            tempKeyFirst = mapKey[andPro[index - 1]]

                        if mapKey.get(andPro[index + 1], -1) == -1:
                            tempKeySecond = andPro[index + 1]
                        else:
                            tempKeySecond = mapKey[andPro[index + 1]]

                        res["%s and %s" %
                            (andPro[index - 1], andPro[index + 1])] = {}
                        for k in res[tempKeyFirst].keys():
                            res["%s and %s" %
                                (andPro[index - 1],
                                 andPro[index + 1])][k] = res[tempKeyFirst][
                                     k] and res[tempKeySecond][k]
                        tempQuery = tempQuery.replace(
                            "%s and %s" %
                            (andPro[index - 1], andPro[index + 1]),
                            "%d_%s" % (index, "secondItr"))
                        mapKey["%d_%s" %
                               (index, "secondItr")] = "%s and %s" % (
                                   andPro[index - 1], andPro[index + 1])
            """OR operation"""
            orPro = tempQuery.split(" ")
            for index, term in enumerate(orPro):
                if term == "or":
                    if orPro[index + 1] == "not":
                        continue
                    else:
                        if mapKey.get(orPro[index - 1], -1) == -1:
                            tempKeyFirst = orPro[index - 1]
                        else:
                            tempKeyFirst = mapKey[orPro[index - 1]]

                        if mapKey.get(orPro[index + 1], -1) == -1:
                            tempKeySecond = orPro[index + 1]
                        else:
                            tempKeySecond = mapKey[orPro[index + 1]]

                        res["%s or %s" %
                            (orPro[index - 1], orPro[index + 1])] = {}
                        for k in res[tempKeyFirst].keys():
                            res["%s or %s" %
                                (orPro[index - 1], orPro[index + 1])][k] = res[
                                    tempKeyFirst][k] or res[tempKeySecond][k]
                        tempQuery = tempQuery.replace(
                            "%s or %s" % (orPro[index - 1], orPro[index + 1]),
                            "%d_%s" % (index, "thirdItr"))
                        mapKey["%d_%s" % (index, "thirdItr")] = "%s or %s" % (
                            orPro[index - 1], orPro[index + 1])
            """AND NOT, OR NOT, BUT NOT operations"""
            notPro = tempQuery.split(" ")
            for index, term in enumerate(notPro):
                if term == "not":
                    tempKeyNot = {}
                    if mapKey.get(notPro[index + 1], -1) == -1:
                        tempKeySecond = notPro[index + 1]
                    else:
                        tempKeySecond = mapKey[notPro[index + 1]]

                    for k in res[tempKeySecond].keys():
                        if not res[tempKeySecond][k]:
                            tempKeyNot[k] = 1
                        else:
                            tempKeyNot[k] = 0

            for index, term in enumerate(notPro):
                if term == "and":
                    if mapKey.get(notPro[index - 1], -1) == -1:
                        tempKeyFirst = notPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[notPro[index - 1]]

                    res["%s and not %s" %
                        (notPro[index - 1], notPro[index + 2])] = {}
                    for kee in res[tempKeyFirst].keys():
                        res["%s and not %s" %
                            (notPro[index - 1], notPro[index + 2])][kee] = res[
                                tempKeyFirst][kee] and tempKeyNot[kee]
                        tempQuery = tempQuery.replace(
                            "%s and not %s" %
                            (notPro[index - 1], notPro[index + 2]),
                            "%d_%s" % (index, "fourthItr"))
                        mapKey["%d_%s" %
                               (index, "fourthItr")] = "%s and not %s" % (
                                   notPro[index - 1], notPro[index + 2])

                if term == "or":
                    if mapKey.get(notPro[index - 1], -1) == -1:
                        tempKeyFirst = notPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[notPro[index - 1]]

                    res["%s or not %s" %
                        (notPro[index - 1], notPro[index + 2])] = {}
                    for kee in res[tempKeyFirst].keys():
                        res["%s or not %s" %
                            (notPro[index - 1], notPro[index + 2])][kee] = res[
                                tempKeyFirst][kee] or tempKeyNot[kee]
                        tempQuery = tempQuery.replace(
                            "%s or not %s" %
                            (notPro[index - 1], notPro[index + 2]),
                            "%d_%s" % (index, "fourthItr"))
                        mapKey["%d_%s" %
                               (index, "fourthItr")] = "%s or not %s" % (
                                   notPro[index - 1], notPro[index + 2])

            self.queryAnswer(originalquery, tempQuery, mapKey, res)
        except KeyError:
            # raised when a query term is missing from the index
            print('The term is not present in the Documents')
Example #11
def applyStem(word):
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
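
A quick check, again assuming the three-argument stem(word, start, end) API:

print(applyStem("running"))   # -> 'run'
print(applyStem("caresses"))  # -> 'caress'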
Example #12
import os
import sys
import re as reg
import math
from array import array
from collections import defaultdict
from porterStemmer import PorterStemmer
import copy

#Creating object of PorterStemmer Class
portstemmer_obj = PorterStemmer()
class CreateIndex:
	def __init__(self):
		self.mainindex = defaultdict(list)
		self.termfrequency = defaultdict(list)
		self.documentfrequency = defaultdict(int)
		self.totaldocuments = 0
		self.indexanditstitle = defaultdict(list)

	def findstopwords(self):
		stopwordsfile = open('stopwords.txt','r',encoding='UTF-8')
		stopwords=[line.rstrip() for line in stopwordsfile]
		self.stop_words = dict.fromkeys(stopwords)
		stopwordsfile.close()

	def get_important_terms(self, lines):
		# keep only the useful words and terms from the text
		lines = lines.lower()
		lines = reg.sub(r'[^a-z0-9 ]', ' ', lines)
		lines = lines.split()
		lines = [ele for ele in lines if ele not in self.stop_words]
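
This excerpt also cuts off before stemming. Given the module-level portstemmer_obj created above, the method presumably ends along these lines:

		# hypothetical continuation of get_important_terms
		lines = [portstemmer_obj.stem(ele, 0, len(ele) - 1) for ele in lines]
		return lines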
Example #13
    def __init__(self):
        # a raw string keeps the backslash literal without an escape warning
        self.remove_punctuation_set = set(r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
        self.stemmer = PorterStemmer()
        self.stopWordsList = []
        self.loadStopWords()