Example #1
def processQuery():
    
    f = open("Queries.txt")
    text = f.read()
    lines = text.splitlines()
    for i in lines:
        raw_query = i
        raw_query = raw_query.replace("\n", "")
        raw_query = raw_query.lower()
        query = re.compile(r'(\w+)', re.DOTALL).findall(raw_query)
        
        print(query)
        
        queryNum = query[0]
        #print(queryNum)
        
        query = re.compile(r'[a-z]+',re.DOTALL).findall(raw_query)
        
        query = list(filter(None, query))
        #print(query)
        stemmer = PorterStemmer()
        query = list(map(lambda word: stemmer.stem(word, 0, len(word) - 1), query))

        
        queryLen = len(query)
        #print(queryLen)

        #run the models
        okapiTF(query, queryNum)
        tfIdf(query, queryNum, queryLen)
        smoothing(query, queryNum, 'Laplace')
        smoothing(query, queryNum, 'Jelinek-Mercer')
        bm25(query, queryNum)

    print("Queries processed")
Example #2
class Parser:

    # A processor for removing the commoner morphological and inflexional endings from words in English
    stemmer = None
    stopwords = []

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.p = re.compile(r"&.{1,5}?;|[!-@[-`{-~]")
        for file in glob.glob(os.path.dirname(__file__) + "/stopwords/*/*.txt"):
            self.stopwords += [line.strip() for line in open(file).readlines()]
        self.stopwords.append("the")

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = self.p.sub(" ", string)
        string = string.lower()
        return string

    def removeStopwords(self, words):
        """ Remove common words which have no search value """
        return [word for word in words if word not in self.stopwords]

    def tokenise(self, string, stem=False):
        """ break string up into tokens and stem words """
        string = self.clean(string)
        words = string.split()

        if stem:
            return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
        else:
            return words

    def tokenize(self, string, stem=False):
        return self.tokenise(string, stem=stem)
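A minimal usage sketch for this Parser; it assumes the porterStemmer module used throughout these examples and the stopwords/*/*.txt files that __init__ expects are present:

# Hypothetical usage; Parser() will fail if the stopwords directory is missing.
parser = Parser()

words = parser.tokenise("The Quick, Brown Foxes!")      # ['the', 'quick', 'brown', 'foxes']
stems = parser.tokenise("The Quick, Brown Foxes!", stem=True)
kept = parser.removeStopwords(stems)                    # drops 'the' and any other loaded stop words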
Example #3
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        print fileT
        if (len(fileT) > 1) and (fileT not in stopwords):
            newDict[k].append(fileT)
Example #4
class Parser:
    def __init__(self):
        self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
        self.stemmer = PorterStemmer()
        self.stopWordsList = []
        self.loadStopWords()

    '''
    words_list is an array
    '''

    def fullParse(self, words_list):
        stopped = self.removeStopWords(words_list)
        cleaned = self.cleanCaseAndPunctuation(stopped)
        stopped = self.stemWords(cleaned)
        return stopped

    def stemWords(self, words_list):
        stemmed = []
        for word in words_list:
            word = self.stemmer.stem(word, 0, len(word) - 1)
            stemmed.append(word)
        return stemmed

    def removeStopWords(self, words_list):
        # Keep only the words that do not appear in the stop word list
        non_stop_list = []
        for word in words_list:
            if word.strip() not in self.stopWordsList:
                non_stop_list.append(word)
        return non_stop_list

    def cleanCaseAndPunctuation(self, words_list):
        clean_list = []
        for word in words_list:
            word = word.lower()
            if not word.startswith('http'):
                clean = ''.join(
                    [c for c in word if c not in self.remove_punctuation_set])
                if clean:
                    clean_list.append(clean)
        return clean_list

    def printStopWords(self):
        print "****************************************************************"
        print "                         STOP WORDS"
        print "****************************************************************"
        print self.stopWordsList

    '''
    happens on __init__
    '''

    def loadStopWords(self):
        for line in open(STOPWORDS_FILE):
            self.stopWordsList.append(line.strip())
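A short, hypothetical usage sketch of the fullParse pipeline above (stop word removal, then case/punctuation cleanup, then stemming); STOPWORDS_FILE and the porterStemmer module must exist in the surrounding project for this to run:

# Hypothetical input; the URL is dropped and punctuation stripped by
# cleanCaseAndPunctuation before stemming.
parser = Parser()
raw_words = ["The", "runners", "were", "running,", "fast!", "http://example.com"]
print(parser.fullParse(raw_words))
# Roughly: ['the', 'runner', 'were', 'run', 'fast'], depending on the
# stop word list and the stemmer port.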
Example #5
class Parser:

    def __init__(self):
        self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
        self.stemmer = PorterStemmer()
        self.stopWordsList = []
        self.loadStopWords()


    '''
    words_list is an array
    '''
    def fullParse(self, words_list):
        stopped = self.removeStopWords(words_list)
        cleaned = self.cleanCaseAndPunctuation(stopped)
        stopped = self.stemWords(cleaned)
        return stopped

    def stemWords(self, words_list):
        stemmed = []
        for word in words_list:
            word = self.stemmer.stem(word, 0, len(word)-1)
            stemmed.append(word)
        return stemmed

    def removeStopWords(self, words_list):
        # Keep only the words that do not appear in the stop word list
        non_stop_list = []
        for word in words_list:
            if word.strip() not in self.stopWordsList:
                non_stop_list.append(word)
        return non_stop_list

    def cleanCaseAndPunctuation(self, words_list):
        clean_list = []
        for word in words_list:
            word = word.lower()
            if not word.startswith('http'):
                clean = ''.join([c for c in word if c not in self.remove_punctuation_set])
                if clean:
                    clean_list.append(clean)
        return clean_list

    def printStopWords(self):
        print "****************************************************************"
        print "                         STOP WORDS"
        print "****************************************************************"
        print self.stopWordsList


    '''
    happens on __init__
    '''
    def loadStopWords(self):
        for line in open(STOPWORDS_FILE):
            self.stopWordsList.append(line.strip())
Example #6
    def process(self):
        """
        This function reads the text file and performs-
            -punctuation
            -tokenization
            -lower-casing/upper-casing / punctuation / numbers
            -stop word
            -stemming
        """
        try:
            stopWords = open(self.stopwordFile, "r").read().split()  # list of stop words
            try:
                if self.writerFlag == True:
                    outFile = open(self.oFile, "w")
                stemmer = PorterStemmer()
                dataDic = {}
                translator = str.maketrans('', '', string.punctuation)
                nTranslator = str.maketrans('', '', "0123456789")
                with open(self.iFile) as f:
                    for line in f:
                        try:
                            (key, val) = line.split("\t")
                        except ValueError:
                            continue
                        stringToWrite = ""
                        val = val.translate(translator)
                        val = val.translate(nTranslator)
                        val = val.lower().strip().split(" ")
                        if self.writerFlag == True:
                            stringToWrite = "%s %s \t" % (stringToWrite,
                                                          key.upper())

                        for words in val:
                            if words.strip() not in stopWords:
                                stringToWrite = "%s %s" % (stringToWrite,
                                                           stemmer.stem(words))

                        stringToWrite = "%s \n" % (stringToWrite)
                        if self.writerFlag == False:
                            dataDic[key.strip()] = stringToWrite.strip()
                        else:
                            outFile.write(stringToWrite)
                if self.writerFlag == True:
                    outFile.close()
                else:
                    return dataDic
            except (OSError, IOError) as e:
                print("Wrong input file name or file path", e)
        except (OSError, IOError) as e:
            print("Wrong stopwords file name or file path", e)
Example #7
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            fileT = "./wordFiles/" + fileT
            FILE = open(fileT, 'a')
            initFreq = checkforFrequency(k, fileT)
            if checkifWritten(fileT, k):
                FILE.write(str(fileT[12:]) + " " + str(k) + " " + str(initFreq))
                FILE.write("\n")
                return 1
    return 0
Example #8
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        #stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            fileT = "./wordFiles/" + fileT
            FILE = open(fileT, 'a')
            initFreq = checkforFrequency(k, fileT)
            if checkifWritten(fileT, k):
                FILE.write(
                    str(fileT[12:]) + " " + str(k) + " " + str(initFreq))
                FILE.write("\n")
                return 1
    return 0
Example #9
def tokenize(document, stem):
    tokens = []
    p = PorterStemmer()
    for text in document.headline, document.graphic, document.text:
        # Lowercase and split on non-alphanumerics
        text = text.lower()
        text_tokens = re.split(r'[\W]', text)
        if stem:
            stem_tokens = []
            for t in text_tokens:
                t = p.stem(t, 0, len(t) - 1)
                stem_tokens.append(t)
            text_tokens = stem_tokens
        tokens += text_tokens

    # Remove empty strings in resulting tokens list
    tokens = list(filter(None, tokens))
    return tokens
Example #10
def calculate_bm25(topic_id, topic, token_token_id, postings_list, doc_id_no, average_doc_length, stem, docs_path):
    """Calculates BM25 for a topic against all LATimes Documents, returns ordered dictionary of doc_no to ranking"""
    query_tokens = tokenize(topic)
    doc_no_score = {}
    N = len(doc_id_no)

    p = PorterStemmer()

    # Calculate tf in query, and idf
    for token in query_tokens:
        qf = query_tokens.count(token)
        token_tf = ((K2 + 1)*qf) / (K2 + qf)

        # Calculate idf
        if stem:
            token = p.stem(token, 0, len(token) - 1)
        token_id = token_token_id[token]
        postings = postings_list[token_id]
        # Postings follow format: [doc_id, count]
        n_i = len(postings[::2])
        a = (N - n_i + 0.5) / (n_i + 0.5)
        token_idf = math.log(a)

        # Calculate tf for docs
        for i in range(0, len(postings), 2):
            doc_id = postings[i]
            doc_no = doc_id_no[doc_id]
            document = getDocument.retrieve_by_docno(docs_path, doc_no)

            fi = postings[i+1]
            K = K1 * ((1 - B) + B * (document.length / average_doc_length))
            doc_tf = ((K1 + 1)*fi) / (K + fi)
            score = doc_tf * token_tf * token_idf
            if doc_no in doc_no_score:
                doc_no_score[doc_no] = doc_no_score[doc_no] + score
            else:
                doc_no_score[doc_no] = score
    sorted_doc_no_score = OrderedDict(sorted(doc_no_score.items(), key=lambda t: t[1], reverse=True))

    print("Calculated scores for query: {}".format(topic_id))
    return sorted_doc_no_score
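The per-term arithmetic above is standard BM25; here is a compact, self-contained restatement of the same score (K1, K2 and B are module-level constants in the original file, so the values below are only placeholders):

import math

K1, K2, B = 1.2, 7.0, 0.75   # placeholder values; the originals are module globals

def bm25_term_score(qf, fi, n_i, N, doc_len, avg_doc_len):
    """One query term's contribution to one document's score,
    mirroring the arithmetic in calculate_bm25 above."""
    token_tf = ((K2 + 1) * qf) / (K2 + qf)                 # query-side term frequency
    token_idf = math.log((N - n_i + 0.5) / (n_i + 0.5))    # idf with the 0.5 smoothing
    K = K1 * ((1 - B) + B * (doc_len / avg_doc_len))       # document length normalisation
    doc_tf = ((K1 + 1) * fi) / (K + fi)                    # document-side term frequency
    return doc_tf * token_tf * token_idf

# A term occurring once in the query, 3 times in an average-length document,
# and appearing in 50 of 1000 documents:
print(bm25_term_score(qf=1, fi=3, n_i=50, N=1000, doc_len=120, avg_doc_len=120))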
Example #11
            temp_string += " " + words

        punc_type_1 = [",","-","=","/","\\","'",";","^","+","|",":","<",">","`","&","(",")"]
        punc_type_2 = [".",'"',"[","]","?","!","*","%","{","}","$"]

        for punc in punc_type_1:
            if punc in temp_string:
                temp_string = temp_string.replace (punc, " ")
        for punc in punc_type_2:
            if punc in temp_string:
                temp_string = temp_string.replace (punc, "")

        temp_string = temp_string.split()
        final_word_list = [x for x in temp_string if x not in stop_words]
        p = PorterStemmer()
        mid_list = [(p.stem(word, 0, len (word)-1)) for word in final_word_list]
        new_list = [x for x in mid_list if x not in stop_words]
        final_string = ''.join(" " + x for x in new_list)
        query_hashmap[key] = final_string.strip()
        #print query_hashmap[key] # printing each query after stopping and stemming
    
    model_dict = {}
    query_word_count = defaultdict(float)
    file1 = open ("file1.txt","r").readlines()
    
    for key in query_hashmap.keys():
        query = query_hashmap[key].split()
        print query
        query = map(str.lower,query)
        query_dict = {}
        help_dict = {}
Example #12
def applyStem(word):
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
Example #13
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents = re.compile('<[^<>]+>').sub(' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\\s]*').sub(
        ' httpaddr ', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\\s]+@[^\\s]+').sub(' emailaddr ',
                                                       email_contents)

    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)

    # Other
    email_contents = re.split('[ @$/#.-:&*+=\\[\\]?!(){},'
                              '\">_<;%\\n\\r]', email_contents)
    email_contents = [word for word in email_contents if len(word) > 0]

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Process file
    stemmer = PorterStemmer()
    processed_email = []
    for word in email_contents:
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)
        # Skip the word if it is too short
        if len(word) < 1:
            continue
        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable str. You should look up str in the
        #               vocabulary list (vocabList). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if str = 'action', then you should
        #               look up the vocabulary list to find where in vocabList
        #               'action' appears. For example, if vocabList{18} =
        #               'action', then, you should add 18 to the word_indices
        #               vector (e.g., word_indices = [word_indices ; 18]; ).
        #
        # Note: vocabList{idx} returns the word with index idx in the
        #       vocabulary list.
        #
        # Note: You can use strcmp(str1, str2) to compare two strings (str1 and
        #       str2). It will return 1 only if the two strings are equivalent.
        #
        try:
            index = vocabList.index(word)
        except ValueError:
            pass
        else:
            word_indices.append(index)
        # ============================================================"
    print(' '.join(processed_email))
    # Print footer
    print('\n\n=========================')
    return word_indices
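The normalisation steps above are plain re substitutions and can be tried in isolation; a tiny sketch with an invented sample string:

import re

sample = "Visit http://example.com or mail user@example.com for $100 off"

text = sample.lower()
text = re.compile('<[^<>]+>').sub(' ', text)                          # strip HTML tags
text = re.compile('[0-9]+').sub(' number ', text)                     # digits -> 'number'
text = re.compile('(http|https)://[^\\s]*').sub(' httpaddr ', text)   # URLs -> 'httpaddr'
text = re.compile('[^\\s]+@[^\\s]+').sub(' emailaddr ', text)         # emails -> 'emailaddr'
text = re.compile('[$]+').sub(' dollar ', text)                       # '$' -> 'dollar'

print(text)   # roughly: 'visit  httpaddr  or mail  emailaddr  for  dollar  number  off'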
Example #14
    for elements in temp:
        strg += " " + elements

    punc1 = [",","-","=","/","'",";","^","+","|",":","<",">","`","&","(",")"]
    punc2 = [".",'"',"[","]","?","!","*","%","{","}"]
    for punc in punc1:
       if punc in strg:
          strg = strg.replace(punc," ")
    for punc in punc2:
       if punc in strg:
          strg = strg.replace(punc,"")
    
    strg = strg.split()
    finallist = [x for x in strg if x not in stop]
    p = PorterStemmer()
    midlist = [(p.stem(word, 0, len(word)-1)) for word in finallist]
    newlist = [x for x in midlist if x not in stop]
    finalstring = ''.join(" " + x for x in newlist)
   
    queryhashmap[key] = finalstring.strip()

avgdoclen = 46.25
#avgdoclen = 46.2484394507 #zipfs avgdoclen

def calcOBM25(OBM25dict,docid,doclen,termfreq,df):
    b = 0.6 #0.2-1.0
    k = 1.6 #1.2-2.0
    idf = log(3204.0/df)
    numerator = termfreq * float(k+1.0)
    denominator = termfreq + k*(1.0 - b + (b*doclen)/avgdoclen)
    score = idf * (numerator/denominator)
Example #15
    def querySearcher(self):
        """This is the main function which performs the AND, OR, AND NOT, BUT NOT and OR NOT operations"""
        try:
            stemmer = PorterStemmer()
            preProcess = PreProcessing(False, self.iFile, "",
                                       self.stopwordFile)
            preProcessRes = preProcess.process()
            createIndex = InvertedIndexGenerator(False, preProcessRes, "")
            mainIndex = createIndex.generate()
            originalquery = self.query
            self.query = self.query.lower()
            self.query = self.query.replace('but', 'and')
            querySep = list(self.parenthetic_contents(self.query))
            res = self.queryCalculator(querySep, mainIndex, stemmer,
                                       preProcessRes)
            tempQuery = self.query
            tempQuery = tempQuery.replace('{', '')
            tempQuery = tempQuery.replace('}', '')
            tempQuery = tempQuery.replace('(', '')
            tempQuery = tempQuery.replace(')', '')
            tempQuery = tempQuery.replace('/', '')
            mapKey = {}

            quryStem = []
            for t in tempQuery.split(" "):
                quryStem.append(stemmer.stem(t))
            tempQuery = ' '.join(quryStem)

            for i, r in enumerate(res.keys()):
                mapKey["%d_%s" % (i, "firstItr")] = r
                tempQuery = tempQuery.replace(r, "%d_%s" % (i, "firstItr"))
            res = {**res, **mainIndex}
            andPro = tempQuery.split(" ")
            """AND operation"""
            for index, term in enumerate(andPro):
                if term == "and":
                    if andPro[index + 1] == "not":
                        continue
                    else:
                        if mapKey.get(andPro[index - 1], -1) == -1:
                            tempKeyFirst = andPro[index - 1]
                        else:
                            tempKeyFirst = mapKey[andPro[index - 1]]

                        if mapKey.get(andPro[index + 1], -1) == -1:
                            tempKeySecond = andPro[index + 1]
                        else:
                            tempKeySecond = mapKey[andPro[index + 1]]

                        res["%s and %s" %
                            (andPro[index - 1], andPro[index + 1])] = {}
                        for k in res[tempKeyFirst].keys():
                            res["%s and %s" %
                                (andPro[index - 1],
                                 andPro[index + 1])][k] = res[tempKeyFirst][
                                     k] and res[tempKeySecond][k]
                        tempQuery = tempQuery.replace(
                            "%s and %s" %
                            (andPro[index - 1], andPro[index + 1]),
                            "%d_%s" % (index, "secondItr"))
                        mapKey["%d_%s" %
                               (index, "secondItr")] = "%s and %s" % (
                                   andPro[index - 1], andPro[index + 1])
            """OR operation"""
            orPro = tempQuery.split(" ")
            for index, term in enumerate(orPro):
                if term == "or":
                    if orPro[index + 1] == "not":
                        continue
                    else:
                        if mapKey.get(orPro[index - 1], -1) == -1:
                            tempKeyFirst = orPro[index - 1]
                        else:
                            tempKeyFirst = mapKey[orPro[index - 1]]

                        if mapKey.get(orPro[index + 1], -1) == -1:
                            tempKeySecond = orPro[index + 1]
                        else:
                            tempKeySecond = mapKey[orPro[index + 1]]

                        res["%s or %s" %
                            (orPro[index - 1], orPro[index + 1])] = {}
                        for k in res[tempKeyFirst].keys():
                            res["%s or %s" %
                                (orPro[index - 1], orPro[index + 1])][k] = res[
                                    tempKeyFirst][k] or res[tempKeySecond][k]
                        tempQuery = tempQuery.replace(
                            "%s or %s" % (orPro[index - 1], orPro[index + 1]),
                            "%d_%s" % (index, "thirdItr"))
                        mapKey["%d_%s" % (index, "thirdItr")] = "%s or %s" % (
                            orPro[index - 1], orPro[index + 1])
            """AND NOT, OR NOT, BUT NOT operations"""
            notPro = tempQuery.split(" ")
            for index, term in enumerate(notPro):
                if term == "not":
                    tempKeyNot = {}
                    if mapKey.get(notPro[index + 1], -1) == -1:
                        tempKeySecond = notPro[index + 1]
                    else:
                        tempKeySecond = mapKey[notPro[index + 1]]

                    for k in res[tempKeySecond].keys():
                        if not res[tempKeySecond][k] == True:
                            tempKeyNot[k] = 1
                        else:
                            tempKeyNot[k] = 0

            for index, term in enumerate(notPro):
                if term == "and":
                    if mapKey.get(notPro[index - 1], -1) == -1:
                        tempKeyFirst = notPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[notPro[index - 1]]

                    res["%s and not %s" %
                        (notPro[index - 1], notPro[index + 2])] = {}
                    for kee in res[tempKeyFirst].keys():
                        res["%s and not %s" %
                            (notPro[index - 1], notPro[index + 2])][kee] = res[
                                tempKeyFirst][kee] and tempKeyNot[kee]
                        tempQuery = tempQuery.replace(
                            "%s and not %s" %
                            (notPro[index - 1], notPro[index + 2]),
                            "%d_%s" % (index, "fourthItr"))
                        mapKey["%d_%s" %
                               (index, "fourthItr")] = "%s and not %s" % (
                                   notPro[index - 1], notPro[index + 2])

                if term == "or":
                    if mapKey.get(notPro[index - 1], -1) == -1:
                        tempKeyFirst = notPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[notPro[index - 1]]

                    res["%s or not %s" %
                        (notPro[index - 1], notPro[index + 2])] = {}
                    for kee in res[tempKeyFirst].keys():
                        res["%s or not %s" %
                            (notPro[index - 1], notPro[index + 2])][kee] = res[
                                tempKeyFirst][kee] or tempKeyNot[kee]
                        tempQuery = tempQuery.replace(
                            "%s or not %s" %
                            (notPro[index - 1], notPro[index + 2]),
                            "%d_%s" % (index, "fourthItr"))
                        mapKey["%d_%s" %
                               (index, "fourthItr")] = "%s or not %s" % (
                                   notPro[index - 1], notPro[index + 2])

            self.queryAnswer(originalquery, tempQuery, mapKey, res)
        except Exception:
            print('The term is not present in the Documents')
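Underneath the bookkeeping, the AND / OR / NOT handling above just combines per-term presence maps (document -> boolean) built from the inverted index; a stripped-down sketch of that core step with hypothetical postings:

# Hypothetical presence maps of the shape querySearcher builds from its index.
docs = ["d1", "d2", "d3", "d4"]
postings = {
    "apple":  {"d1": True,  "d2": True,  "d3": False, "d4": False},
    "banana": {"d1": False, "d2": True,  "d3": True,  "d4": False},
}

and_res = {d: postings["apple"][d] and postings["banana"][d] for d in docs}   # only d2
or_res = {d: postings["apple"][d] or postings["banana"][d] for d in docs}     # d1, d2, d3
not_banana = {d: not postings["banana"][d] for d in docs}                     # negate the right operand first
and_not = {d: postings["apple"][d] and not_banana[d] for d in docs}           # only d1

print([d for d in docs if and_not[d]])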
Example #16
            ",", "-", "=", "/", "\\", "'", ";", "^", "+", "|", ":", "<", ">",
            "`", "&", "(", ")"
        ]
        punc_type_2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}", "$"]

        for punc in punc_type_1:
            if punc in temp_string:
                temp_string = temp_string.replace(punc, " ")
        for punc in punc_type_2:
            if punc in temp_string:
                temp_string = temp_string.replace(punc, "")

        temp_string = temp_string.split()
        final_word_list = [x for x in temp_string if x not in stop_words]
        p = PorterStemmer()
        mid_list = [(p.stem(word, 0,
                            len(word) - 1)) for word in final_word_list]
        new_list = [x for x in mid_list if x not in stop_words]
        final_string = ''.join(" " + x for x in new_list)
        query_hashmap[key] = final_string.strip()
        #print query_hashmap[key] # printing each query after stopping and stemming

    model_dict = {}
    query_word_count = defaultdict(float)
    file1 = open("file1.txt", "r").readlines()

    for key in query_hashmap.keys():
        query = query_hashmap[key].split()
        #print query #query with uppercase
        query = map(str.lower, query)
        print query  #query with lowercase
        query_dict = {}
Example #17
    #defining punctuations to be eliminated
    punct_list_2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}", "$"]

    #removing punctuations
    for punct in punct_list_1:
        if punct in key_text:
            key_text = key_text.replace(punct, " ")
    for punct in punct_list_2:
        if punct in key_text:
            key_text = key_text.replace(punct, "")

    key_text = key_text.split()
    #removing stop words
    text_wo_stop_punct = [x for x in key_text if x not in stop_word_file]
    p = PorterStemmer()
    midlist = [(p.stem(word, 0, (len(word) - 1)))
               for word in text_wo_stop_punct]
    newlist = [x for x in midlist if x not in stop_word_file]
    finaltext = ''.join(" " + x for x in newlist)
    dict_map[key] = finaltext.strip()
print "Completed stemming and stopping"

dict_word_ID_map = {}
i = 1
print "Assigning IDs to words..........please wait"
for key in dict_map.keys():
    for word in dict_map[key].split():
        if dict_word_ID_map.has_key(word):
            pass
        else:
            dict_word_ID_map[word] = i
            i += 1
Example #18
  #defining punctuations to be eliminated
  punct_list_2 = [".",'"',"[","]","?","!","*","%","{","}","$"]

  #removing punctuations
  for punct in punct_list_1:
    if punct in key_text:
      key_text = key_text.replace (punct, " ")
  for punct in punct_list_2:
    if punct in key_text:
      key_text = key_text.replace (punct, "")

  key_text = key_text.split()
  #removing stop words
  text_wo_stop_punct = [x for x in key_text if x not in stop_word_file]
  p = PorterStemmer()
  midlist = [(p.stem (word, 0, (len (word) - 1))) for word in text_wo_stop_punct]
  newlist = [x for x in midlist if x not in stop_word_file]
  finaltext = ''.join (" " + x for x in newlist)
  dict_map[key] = finaltext.strip()
print "Completed stemming and stopping"

dict_word_ID_map = {}
i = 1
print "Assigning IDs to words..........please wait"
for key in dict_map.keys():
  for word in dict_map[key].split():
    if dict_word_ID_map.has_key (word):
      pass
    else:
      dict_word_ID_map[word] = i
      i += 1
Example #19
        i.create_mat()
        #print(i.id_title)
        #print(i.token_set)
        while True: 
            st = raw_input("search karo: ")
            st = st.lower()
            l = st.split()
            main = []
            for token in l:
                match, index = i.find_match(token)
                if match == 100:
                    main.append(token)
                else:
                    main.append(index)

            line = [index_porter.stem(word, 0, len(word) - 1) for word in main]

            #print("asdadsasd",line)
            rank_list=i.rank_doc(line)

            if len(rank_list)==0:
                print("NO RESULTS FOUND")
            else:
                print("SHOWING RESULTS FOR "+ " ".join(main))
                

            for x in rank_list:
                print(x[0],x[1])
                print(i.id_title[str(x[0])][0])
                #print(str(i.id_title[str(x[0])][1]))
                h1='https://en.wikipedia.org/wiki?curid=' + str(x[0])
Example #20
def applyStem(word):
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
Example #21
def stemWords(tokens):
    # Create the stemmer once and stem each token in place
    stemmer = PorterStemmer()
    for i in range(len(tokens)):
        tokens[i] = stemmer.stem(tokens[i], 0, len(tokens[i]) - 1)
    return tokens
Example #22
#!/usr/bin/env python
#encoding=utf8

from porterStemmer import PorterStemmer

if __name__ == "__main__":
    stemmer = PorterStemmer()
    word = "Keyphrases"
    result = stemmer.stem(word, 0, len(word)-1)
    print result
Example #23
def parseDocs():
    global terms
    global documents
    files = os.listdir("./cacm")
    sp = open("stopWords.txt")
    stopData = sp.read()
    stopTerms = stopData.lower().split("\n")
    stopTerms = stopTerms[:len(stopTerms) - 1]
    
    filep1 = open("terms.txt", "a")
    filep2 = open("mappings.txt", "a")
    filep3 = open("documents.txt", "a")
    
    termId = 1 
    
    
    for f in files:
        fp = open("./cacm/" + f)
        documentName = f.split(".")[0]
        
        documentId = documentName.split("-")[1]
        line = fp.read()
        data = re.compile(r'.*?<pre>(.*?)</pre>', re.DOTALL).match(line).group(1)
        data = data.replace("\n", " ")
        splitword = re.compile(r'CA\d+', re.DOTALL).findall(data)[0]
        text = data.split(splitword)
        words = text[0]
        words = words.replace("CACM", " ")
        words = words.lower()
        words = re.compile(r'(\w+)', re.DOTALL).findall(words)
        
        stemmer = PorterStemmer()
        words = list(map(lambda word: stemmer.stem(word, 0, len(word) - 1), words))
        docLength = len(words)   
        
        global totalDocLength
        totalDocLength += docLength
        
        count = collections.Counter(words)   
        
        filep3.write(documentId + " " + documentName + " " + str(docLength) + "\n")

        for term in words:
            if term not in stopTerms and term not in stopList:
                
                global numOfTerms
                numOfTerms += 1
                
                if term in terms:
                    #print(term)
                    attributes = terms[term]
                    #print(attributes)
                    idterm = attributes[0]
                    tf = count[term]
                    documentDetails = attributes[3]
                    latestDoc = len(documentDetails)
                    lastTermId = documentDetails[latestDoc - 1]
                    #print(latestDoc)
                    if documentId == lastTermId[0]:
                        ctf = attributes[1]
                        ctf = ctf + 1
                        df = attributes[2]
                        terms[term] = idterm, ctf, df, documents[term]
                        #print(terms[term])               
                    else:
                        documents[term] = documents[term] + [[documentId, documentName, docLength, tf]]     
                        ctf = attributes[1]
                        ctf = ctf + 1
                        df = attributes[2]
                        df = df + 1
                        terms[term] = idterm, ctf, df, documents[term]
                        #print(terms[term])               
                
                if term not in terms:
                    #print(termId)
                    ctf = 1
                    tf = count[term]
                    df = 1
                    documents[term] = [[documentId, documentName, docLength, tf]]
                    terms[term] = termId, ctf, df, documents[term]
                    
                    termId += 1
                    #print(termId)
                    #print(terms[term])

    for key in terms:
        
        attributes = terms[key]
        key_termName = key
        key_termId = attributes[0]
        key_ctf = attributes[1]
        key_df = attributes[2]
        key_documents = attributes[3]
        
        offsetLength = len(str(key_termId)) + 1
        
        filep2.write(str(key_termId) + " ")
        
        for doc in key_documents:
            docId = doc[0]
            tf = doc[3]
            offsetLength += len(docId) + len(str(tf)) + 2  
            filep2.write(docId + " " + str(tf) + " ")
            
        
        filep2.write("\n")
        
        global offset
        filep1.write(key_termName + " " + str(key_termId) + " " + str(key_ctf) + " " + str(key_df) + " " + str(offset) + " " + str(offsetLength) + "\n")
        offset += offsetLength + 1
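For orientation, the record layouts written above, inferred from the write calls: documents.txt holds "docId docName docLength", mappings.txt holds one line per term with the term id followed by alternating docId/tf pairs, and terms.txt holds "term termId ctf df offset offsetLength", where offset and offsetLength locate the term's line in mappings.txt. A minimal, hypothetical reader for those two line formats (the sample values are invented):

# Hypothetical helpers; field order follows the write calls in parseDocs.
def parse_terms_line(line):
    term, term_id, ctf, df, offset, length = line.split()
    return term, int(term_id), int(ctf), int(df), int(offset), int(length)

def parse_mappings_line(line):
    fields = line.split()
    term_id, rest = int(fields[0]), fields[1:]
    # rest alternates docId, tf, docId, tf, ...
    return term_id, [(rest[i], int(rest[i + 1])) for i in range(0, len(rest), 2)]

print(parse_terms_line("algorithm 7 42 30 1200 64"))
print(parse_mappings_line("7 1380 3 1381 1"))   # (7, [('1380', 3), ('1381', 1)])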