Example #1
def processQuery():
    # Assumes `re` and PorterStemmer are imported at module level, along with
    # the model functions called below.
    with open("Queries.txt") as f:
        lines = f.read().splitlines()
    for raw_query in lines:
        raw_query = raw_query.lower()
        query = re.findall(r'\w+', raw_query)

        print(query)

        # the first token on each line is the query number
        queryNum = query[0]

        query = re.findall(r'[a-z]+', raw_query)
        stemmer = PorterStemmer()
        # materialize as a list so len() works (map/filter return iterators
        # in Python 3)
        query = [stemmer.stem(word, 0, len(word) - 1) for word in query]

        queryLen = len(query)

        # run the models
        okapiTF(query, queryNum)
        tfIdf(query, queryNum, queryLen)
        smoothing(query, queryNum, 'Laplace')
        smoothing(query, queryNum, 'Jelinek-Mercer')
        bm25(query, queryNum)

    print("Queries processed")
Example #2
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        print(fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            newDict[k].append(fileT)
Example #3
    def process(self):
        """
        This function reads the text file and performs-
            -punctuation
            -tokenization
            -lower-casing/upper-casing / punctuation / numbers
            -stop word
            -stemming
        """
        try:
            # load stopwords as a set of words; testing membership against the
            # raw file string would match substrings
            stopWords = set(open(self.stopwordFile, "r").read().split())
            try:
                if self.writerFlag:
                    outFile = open(self.oFile, "w")
                stemmer = PorterStemmer()
                dataDic = {}
                translator = str.maketrans('', '', string.punctuation)
                nTranslator = str.maketrans('', '', "0123456789")
                with open(self.iFile) as f:
                    for line in f:
                        try:
                            (key, val) = line.split("\t")
                        except ValueError:
                            continue
                        stringToWrite = ""
                        val = val.translate(translator)
                        val = val.translate(nTranslator)
                        val = val.lower().strip().split(" ")
                        if self.writerFlag:
                            stringToWrite = "%s %s \t" % (stringToWrite,
                                                          key.upper())

                        for words in val:
                            if words.strip() not in stopWords:
                                stringToWrite = "%s %s" % (stringToWrite,
                                                           stemmer.stem(words))

                        stringToWrite = "%s \n" % (stringToWrite)
                        if not self.writerFlag:
                            dataDic[key.strip()] = stringToWrite.strip()
                        else:
                            outFile.write(stringToWrite)
                if self.writerFlag:
                    outFile.close()
                else:
                    return dataDic
            except (OSError, IOError) as e:
                print("Wrong input file name or file path", e)
        except (OSError, IOError) as e:
            print("Wrong stopwords file name or file path", e)
Example #4
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        # stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            fileT = "./wordFiles/" + fileT
            FILE = open(fileT, 'a')
            initFreq = checkforFrequency(k, fileT)
            if checkifWritten(fileT, k):
                FILE.write(str(fileT[12:]) + " " + str(k) + " " + str(initFreq))
                FILE.write("\n")
                return 1
    return 0
Example #5
def printDocsHelper(fileT, k):
    if fileT not in stopwords:
        #stemmed words
        p = PorterStemmer()
        fileT = p.stem(fileT, 0, len(fileT) - 1) + " "
        fileT = re.sub(r'\s', '', fileT)
        if (len(fileT) > 1) and (fileT not in stopwords):
            fileT = "./wordFiles/" + fileT
            FILE = open(fileT, 'a')
            initFreq = checkforFrequency(k, fileT)
            if checkifWritten(fileT, k):
                FILE.write(
                    str(fileT[12:]) + " " + str(k) + " " + str(initFreq))
                FILE.write("\n")
                return 1
    return 0
Example #6
class Parser:

    # A processor for removing the commoner morphological and inflexional endings from words in English
    stemmer = None
    stopwords = []

    def __init__(self):
        self.stemmer = PorterStemmer()
        self.p = re.compile(r"&.{1,5}?;|[!-@[-`{-~]")
        for file in glob.glob(os.path.dirname(__file__) + "/stopwords/*/*.txt"):
            self.stopwords += [line.strip() for line in open(file).readlines()]
        self.stopwords.append("the")

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = self.p.sub(" ", string)
        string = string.lower()
        return string

    def removeStopwords(self, words):
        """ Remove common words which have no search value """
        return [word for word in words if word not in self.stopwords]

    def tokenise(self, string, stem=False):
        """ break string up into tokens and stem words """
        string = self.clean(string)
        words = string.split()

        if stem:
            return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
        else:
            return words

    def tokenize(self, string, stem=False):
        """ US-spelling alias; the original forgot both `self.` and `return` """
        return self.tokenise(string, stem=stem)
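A minimal usage sketch for the Parser above (hypothetical input; assumes the porterStemmer-style stem(word, start, end) implementation used throughout these examples, and stopword lists under ./stopwords/):

parser = Parser()
# clean() replaces HTML entities and punctuation with spaces and lower-cases,
# then each whitespace-separated token is Porter-stemmed, e.g. "running" -> "run"
print(parser.tokenise("The Running dogs barked!", stem=True))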
Example #7
class Parser:
    def __init__(self):
        self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
        self.stemmer = PorterStemmer()
        self.stopWordsList = []
        self.loadStopWords()

    '''
    words_list is an array
    '''

    def fullParse(self, words_list):
        stopped = self.removeStopWords(words_list)
        cleaned = self.cleanCaseAndPunctuation(stopped)
        stopped = self.stemWords(cleaned)
        return stopped

    def stemWords(self, words_list):
        stemmed = []
        for word in words_list:
            word = self.stemmer.stem(word, 0, len(word) - 1)
            stemmed.append(word)
        return stemmed

    def removeStopWords(self, words_list):
        # keep only words that are not stopwords; the original filtered the
        # *characters* of each word against the stopword list
        return [word for word in words_list
                if word.strip() not in self.stopWordsList]

    def cleanCaseAndPunctuation(self, words_list):
        clean_list = []
        for word in words_list:
            word = word.lower()
            if not word.startswith('http'):
                clean = ''.join(
                    [c for c in word if c not in self.remove_punctuation_set])
                if clean:
                    clean_list.append(clean)
        return clean_list

    def printStopWords(self):
        print("****************************************************************")
        print("                         STOP WORDS")
        print("****************************************************************")
        print(self.stopWordsList)

    '''
    happens on __init__
    '''

    def loadStopWords(self):
        for line in open(STOPWORDS_FILE):
            self.stopWordsList.append(line.strip())
Example #8
def tokenize(document, stem):
    tokens = []
    p = PorterStemmer()
    for text in document.headline, document.graphic, document.text:
        # Lowercase and split on non-alphanumerics
        text = text.lower()
        text_tokens = re.split(r'[\W]', text)
        if stem:
            stem_tokens = []
            for t in text_tokens:
                t = p.stem(t, 0, len(t) - 1)
                stem_tokens.append(t)
            text_tokens = stem_tokens
        tokens += text_tokens

    # Remove empty strings in resulting tokens list
    tokens = list(filter(None, tokens))
    return tokens
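A hypothetical call for the function above (tokenize expects an object exposing headline, graphic and text attributes, so a namedtuple stub stands in here):

from collections import namedtuple
Doc = namedtuple('Doc', 'headline graphic text')
doc = Doc("Dogs bark", "", "Running dogs barked loudly.")
print(tokenize(doc, stem=True))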
Example #9
class Parser:

    def __init__(self):
        self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
        self.stemmer = PorterStemmer()
        self.stopWordsList = []
        self.loadStopWords()


    '''
    words_list is an array
    '''
    def fullParse(self, words_list):
        stopped = self.removeStopWords(words_list)
        cleaned = self.cleanCaseAndPunctuation(stopped)
        stopped = self.stemWords(cleaned)
        return stopped

    def stemWords(self, words_list):
        stemmed = []
        for word in words_list:
            word = self.stemmer.stem(word, 0, len(word)-1)
            stemmed.append(word)
        return stemmed

    def removeStopWords(self, words_list):
        # keep only words that are not stopwords; the original filtered the
        # *characters* of each word against the stopword list
        return [word for word in words_list if word.strip() not in self.stopWordsList]

    def cleanCaseAndPunctuation(self, words_list):
        clean_list = []
        for word in words_list:
            word = word.lower()
            if not word.startswith('http'):
                clean = ''.join([c for c in word if c not in self.remove_punctuation_set])
                if clean:
                    clean_list.append(clean)
        return clean_list

    def printStopWords(self):
        print("****************************************************************")
        print("                         STOP WORDS")
        print("****************************************************************")
        print(self.stopWordsList)


    '''
    happens on __init__
    '''
    def loadStopWords(self):
        for line in open(STOPWORDS_FILE):
            self.stopWordsList.append(line.strip())
Example #10
File: bm25.py  Project: rfarmaha/MSCI_541
def calculate_bm25(topic_id, topic, token_token_id, postings_list, doc_id_no, average_doc_length, stem, docs_path):
    """Calculates BM25 for a topic against all LATimes Documents, returns ordered dictionary of doc_no to ranking"""
    query_tokens = tokenize(topic)
    doc_no_score = {}
    N = len(doc_id_no)

    p = PorterStemmer()

    # Calculate tf in query, and idf
    for token in query_tokens:
        qf = query_tokens.count(token)
        token_tf = ((K2 + 1)*qf) / (K2 + qf)

        # Calculate idf
        if stem:
            token = p.stem(token, 0, len(token) - 1)
        token_id = token_token_id[token]
        postings = postings_list[token_id]
        # Postings follow format: [doc_id, count]
        n_i = len(postings[::2])
        a = (N - n_i + 0.5) / (n_i + 0.5)
        token_idf = math.log(a)

        # Calculate tf for docs
        for i in range(0, len(postings), 2):
            doc_id = postings[i]
            doc_no = doc_id_no[doc_id]
            document = getDocument.retrieve_by_docno(docs_path, doc_no)

            fi = postings[i+1]
            K = K1 * ((1 - B) + B * (document.length / average_doc_length))
            doc_tf = ((K1 + 1)*fi) / (K + fi)
            score = doc_tf * token_tf * token_idf
            if doc_no in doc_no_score:
                doc_no_score[doc_no] = doc_no_score[doc_no] + score
            else:
                doc_no_score[doc_no] = score
    sorted_doc_no_score = OrderedDict(sorted(doc_no_score.items(), key=lambda t: t[1], reverse=True))

    print("Calculated scores for query: {}".format(topic_id))
    return sorted_doc_no_score
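For reference, the per-term weight accumulated above is the classic BM25 product (K1, K2 and B are assumed to be module-level constants in the source project):

    idf(t)   = ln((N - n_t + 0.5) / (n_t + 0.5))
    K        = K1 * ((1 - B) + B * dl / avgdl)
    score(t) = idf(t) * ((K1 + 1) * f_td) / (K + f_td) * ((K2 + 1) * qf_t) / (K2 + qf_t)

where N is the collection size, n_t the number of documents containing t, f_td the within-document frequency, qf_t the within-query frequency, dl the document length and avgdl the average document length.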
Example #11
def build_word_index(doc_dict):
    '''
    Builds the word-index dictionary plus a list of
    [term_id, doc_index, frequency] postings.
    '''
    dictionary = {}
    stopWords = set(stopwords.words('english'))
    p = PorterStemmer()
    global idx
    term_frequency_list = []

    def append_to_word_list(text, doc_index):
        global idx
        text = " ".join(re.findall("[a-zA-Z]+", text))
        # set() collapses repeated surface forms, so f_dt only exceeds 1 when
        # distinct words share a stem (e.g. "run" and "running")
        text = sorted(set(text.split(" ")))
        f_dt = {}
        for word in text:
            if word == "" or word in stopWords:
                continue
            word = p.stem(word, 0, len(word) - 1)
            # update frequency of term
            f_dt[word] = f_dt.get(word, 0) + 1
            # assign the word an id if it is new
            if word not in dictionary:
                dictionary[word] = idx
                idx += 1
            term_frequency_list.append([dictionary[word], doc_index, f_dt[word]])

    idx = 1
    for i in range(1, len(doc_dict) + 1):
        if doc_dict[i][1] != '':
            append_to_word_list(doc_dict[i][1], i)

    return dictionary, term_frequency_list
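A minimal, hypothetical call (the final loop implies doc_dict maps 1-based ids to (title, text)-style tuples; NLTK's stopword corpus must be available):

dictionary, postings = build_word_index({
    1: ("doc-1", "Cats chase mice"),
    2: ("doc-2", "Mice fear cats"),
})
# dictionary maps each stemmed term to an integer id;
# postings holds [term_id, doc_index, frequency] triples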
Example #12
    def querySearcher(self):
        """This is the main function which performs the AND, OR, AND NOT, BUT NOT and OR NOT operations"""
        try:
            stemmer = PorterStemmer()
            preProcess = PreProcessing(False, self.iFile, "",
                                       self.stopwordFile)
            preProcessRes = preProcess.process()
            createIndex = InvertedIndexGenerator(False, preProcessRes, "")
            mainIndex = createIndex.generate()
            originalquery = self.query
            self.query = self.query.lower()
            self.query = self.query.replace('but', 'and')
            querySep = list(self.parenthetic_contents(self.query))
            res = self.queryCalculator(querySep, mainIndex, stemmer,
                                       preProcessRes)
            tempQuery = self.query
            tempQuery = tempQuery.replace('{', '')
            tempQuery = tempQuery.replace('}', '')
            tempQuery = tempQuery.replace('(', '')
            tempQuery = tempQuery.replace(')', '')
            tempQuery = tempQuery.replace('/', '')
            mapKey = {}

            queryStem = []
            for t in tempQuery.split(" "):
                queryStem.append(stemmer.stem(t))
            tempQuery = ' '.join(queryStem)

            for i, r in enumerate(res.keys()):
                mapKey["%d_%s" % (i, "firstItr")] = r
                tempQuery = tempQuery.replace(r, "%d_%s" % (i, "firstItr"))
            res = {**res, **mainIndex}
            andPro = tempQuery.split(" ")
            """AND operation"""
            for index, term in enumerate(andPro):
                if term == "and":
                    if andPro[index + 1] == "not":
                        continue
                    else:
                        if mapKey.get(andPro[index - 1], -1) == -1:
                            tempKeyFirst = andPro[index - 1]
                        else:
                            tempKeyFirst = mapKey[andPro[index - 1]]

                        if mapKey.get(andPro[index + 1], -1) == -1:
                            tempKeySecond = andPro[index + 1]
                        else:
                            tempKeySecond = mapKey[andPro[index + 1]]

                        res["%s and %s" %
                            (andPro[index - 1], andPro[index + 1])] = {}
                        for k in res[tempKeyFirst].keys():
                            res["%s and %s" %
                                (andPro[index - 1],
                                 andPro[index + 1])][k] = res[tempKeyFirst][
                                     k] and res[tempKeySecond][k]
                        tempQuery = tempQuery.replace(
                            "%s and %s" %
                            (andPro[index - 1], andPro[index + 1]),
                            "%d_%s" % (index, "secondItr"))
                        mapKey["%d_%s" %
                               (index, "secondItr")] = "%s and %s" % (
                                   andPro[index - 1], andPro[index + 1])
            """OR operation"""
            orPro = tempQuery.split(" ")
            for index, term in enumerate(orPro):
                if term == "or":
                    if orPro[index + 1] == "not":
                        continue
                    else:
                        if mapKey.get(orPro[index - 1], -1) == -1:
                            tempKeyFirst = orPro[index - 1]
                        else:
                            tempKeyFirst = mapKey[orPro[index - 1]]

                        if mapKey.get(orPro[index + 1], -1) == -1:
                            tempKeySecond = orPro[index + 1]
                        else:
                            tempKeySecond = mapKey[orPro[index + 1]]

                        res["%s or %s" %
                            (orPro[index - 1], orPro[index + 1])] = {}
                        for k in res[tempKeyFirst].keys():
                            res["%s or %s" %
                                (orPro[index - 1], orPro[index + 1])][k] = res[
                                    tempKeyFirst][k] or res[tempKeySecond][k]
                        tempQuery = tempQuery.replace(
                            "%s or %s" % (orPro[index - 1], orPro[index + 1]),
                            "%d_%s" % (index, "thirdItr"))
                        mapKey["%d_%s" % (index, "thirdItr")] = "%s or %s" % (
                            orPro[index - 1], orPro[index + 1])
            """AND NOT, OR NOT, BUT NOT operations"""
            notPro = tempQuery.split(" ")
            for index, term in enumerate(notPro):
                if term == "not":
                    tempKeyNot = {}
                    if mapKey.get(notPro[index + 1], -1) == -1:
                        tempKeySecond = notPro[index + 1]
                    else:
                        tempKeySecond = mapKey[notPro[index + 1]]

                    for k in res[tempKeySecond].keys():
                        if not res[tempKeySecond][k]:
                            tempKeyNot[k] = 1
                        else:
                            tempKeyNot[k] = 0

            for index, term in enumerate(notPro):
                if term == "and":
                    if mapKey.get(notPro[index - 1], -1) == -1:
                        tempKeyFirst = notPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[notPro[index - 1]]

                    res["%s and not %s" %
                        (notPro[index - 1], notPro[index + 2])] = {}
                    for kee in res[tempKeyFirst].keys():
                        res["%s and not %s" %
                            (notPro[index - 1], notPro[index + 2])][kee] = res[
                                tempKeyFirst][kee] and tempKeyNot[kee]
                        tempQuery = tempQuery.replace(
                            "%s and not %s" %
                            (notPro[index - 1], notPro[index + 2]),
                            "%d_%s" % (index, "fourthItr"))
                        mapKey["%d_%s" %
                               (index, "fourthItr")] = "%s and not %s" % (
                                   notPro[index - 1], notPro[index + 2])

                if term == "or":
                    if mapKey.get(notPro[index - 1], -1) == -1:
                        tempKeyFirst = notPro[index - 1]
                    else:
                        tempKeyFirst = mapKey[notPro[index - 1]]

                    res["%s or not %s" %
                        (notPro[index - 1], notPro[index + 2])] = {}
                    for kee in res[tempKeyFirst].keys():
                        res["%s or not %s" %
                            (notPro[index - 1], notPro[index + 2])][kee] = res[
                                tempKeyFirst][kee] or tempKeyNot[kee]
                        tempQuery = tempQuery.replace(
                            "%s or not %s" %
                            (notPro[index - 1], notPro[index + 2]),
                            "%d_%s" % (index, "fourthItr"))
                        mapKey["%d_%s" %
                               (index, "fourthItr")] = "%s or not %s" % (
                                   notPro[index - 1], notPro[index + 2])

            self.queryAnswer(originalquery, tempQuery, mapKey, res)
        except (KeyError, IndexError):
            print('The term is not present in the Documents')
Example #13
  punct_list_1 = [",","-","=","/","\\","'",";","^","+","|",":","<",">","`","&","(",")"]
  #defining punctuations to be eliminated
  punct_list_2 = [".",'"',"[","]","?","!","*","%","{","}","$"]

  #removing punctuations
  for punct in punct_list_1:
    if punct in key_text:
      key_text = key_text.replace (punct, " ")
  for punct in punct_list_2:
    if punct in key_text:
      key_text = key_text.replace (punct, "")

  key_text = key_text.split()
  #removing stop words
  text_wo_stop_punct = [x for x in key_text if x not in stop_word_file]
  p = PorterStemmer()
  midlist = [(p.stem (word, 0, (len (word) - 1))) for word in text_wo_stop_punct]
  newlist = [x for x in midlist if x not in stop_word_file]
  finaltext = ''.join (" " + x for x in newlist)
  dict_map[key] = finaltext.strip()
print "Completed stemming and stopping"

dict_word_ID_map = {}
i = 1
print "Assigning IDs to words..........please wait"
for key in dict_map.keys():
  for word in dict_map[key].split():
    if dict_word_ID_map.has_key (word):
      pass
    else:
      dict_word_ID_map[word] = i
Example #14
'''
    punct_list_1 = [
        ",", "-", "=", "/", "\\", "'", ";", "^", "+", "|", ":", "<", ">",
        "`", "&", "(", ")"
    ]
    #defining punctuations to be eliminated
    punct_list_2 = [".", '"', "[", "]", "?", "!", "*", "%", "{", "}", "$"]

    #removing punctuations
    for punct in punct_list_1:
        if punct in key_text:
            key_text = key_text.replace(punct, " ")
    for punct in punct_list_2:
        if punct in key_text:
            key_text = key_text.replace(punct, "")

    key_text = key_text.split()
    #removing stop words
    text_wo_stop_punct = [x for x in key_text if x not in stop_word_file]
    p = PorterStemmer()
    midlist = [(p.stem(word, 0, (len(word) - 1)))
               for word in text_wo_stop_punct]
    newlist = [x for x in midlist if x not in stop_word_file]
    finaltext = ''.join(" " + x for x in newlist)
    dict_map[key] = finaltext.strip()
print "Completed stemming and stopping"

dict_word_ID_map = {}
i = 1
print "Assigning IDs to words..........please wait"
for key in dict_map.keys():
    for word in dict_map[key].split():
        if dict_word_ID_map.has_key(word):
            pass
        else:
'''

''' Kshitij is avoiding using Nltk as much as possible for now as the searches will be very slow otherwise '''

import sys
import re
import gc
from porterStemmer import PorterStemmer
from collections import defaultdict
from array import array
from math import log,sqrt
import operator
from fuzzywuzzy import fuzz


index_porter = PorterStemmer()

class Index:
    N=4718
    token_set= set()                            #to store the tokens without stemming for correction
    id_title={}                                 #to store the corresponding title for docId
    def __init__(self):
        self.index = defaultdict(list)

    def remove_stop_words(self):
        file = open(self.stopwordsFile, 'r')  # 'rw' is not a valid open() mode
        stop_words = [line.rstrip() for line in file]
        '''rstrip removes whitespace characters(by default)'''
        self.stop_words_dict = dict.fromkeys(stop_words)
        file.close()
Example #16
def applyStem(word):
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
Example #17
    strg = ""
    for elements in temp:
        strg += " " + elements

    punc1 = [",","-","=","/","'",";","^","+","|",":","<",">","`","&","(",")"]
    punc2 = [".",'"',"[","]","?","!","*","%","{","}"]
    for punc in punc1:
       if punc in strg:
          strg = strg.replace(punc," ")
    for punc in punc2:
       if punc in strg:
          strg = strg.replace(punc,"")
    
    strg = strg.split()
    finallist = [x for x in strg if x not in stop]
    p = PorterStemmer()
    midlist = [(p.stem(word, 0, len(word)-1)) for word in finallist]
    newlist = [x for x in midlist if x not in stop]
    finalstring = ''.join(" " + x for x in newlist)
   
    queryhashmap[key] = finalstring.strip()

avgdoclen = 46.25
#avgdoclen = 46.2484394507 #zipfs avgdoclen

def calcOBM25(OBM25dict,docid,doclen,termfreq,df):
    b = 0.6 #0.2-1.0
    k = 1.6 #1.2-2.0
    idf = log(3204.0/df)
    numerator = termfreq * float(k+1.0)
    denominator = termfreq + k*(1.0 - b + (b*doclen)/avgdoclen)
Example #18
import os
import sys
import re as reg
import math
from array import array
from collections import defaultdict
from porterStemmer import PorterStemmer
import copy

#Creating object of PorterStemmer Class
portstemmer_obj = PorterStemmer()
class CreateIndex:
	def __init__(self):
		self.mainindex = defaultdict(list)
		self.termfrequency = defaultdict(list)
		self.documentfrequency = defaultdict(int)
		self.totaldocuments = 0
		self.indexanditstitle = defaultdict(list)

	def findstopwords(self):
		stopwordsfile = open('stopwords.txt', 'r', encoding='UTF-8')
		stopwords = [line.rstrip() for line in stopwordsfile]
		self.stop_words = dict.fromkeys(stopwords)
		stopwordsfile.close()

	def get_important_terms(self, lines):
		# get the useful words and terms from the text
		lines = lines.lower()
		lines = reg.sub(r'[^a-z0-9 ]', ' ', lines)
		lines = lines.split()
		lines = [ele for ele in lines if ele not in self.stop_words]
Example #19
 def __init__(self):
     self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
     self.stemmer = PorterStemmer()
     self.stopWordsList = []
     self.loadStopWords()
Example #20
import sys
from porterStemmer import PorterStemmer
import re  # regex to rescue
from functools import reduce
from collections import defaultdict
#from indexedfile.dat import defaultdict

ps = PorterStemmer()
inverteddict = defaultdict(list)
regex = re.compile("[^a-z0-9 ]+")


def Cleaning():
    # concatenate command line arguement
    words = " ".join(sys.argv[1:])
    # converting all to lowercase to ease in searching
    words = words.lower()
    # removing non-alphanumeric except for space
    words = regex.sub(" ", words).strip()
    words = words.split()
    # remove prepositions and all stopwords
    # (membership is tested against the raw file text, i.e. a substring
    # check rather than a word-list check)
    stoptext = open("preposition.dat").read()
    filterwords = [w for w in words if w not in stoptext]
    # stem words so that text texting and texted are all considered text
    filterwords = [ps.stem(word, 0, len(word) - 1) for word in filterwords]
    filterwords = tuple(filterwords)
    return filterwords
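A hypothetical invocation of the script above (the script name is an assumption; preposition.dat must sit next to it):

# python cleaning.py Texting texted TEXTS
# Cleaning() then returns something like ('text', 'text', 'text'): the words
# are lowercased, filtered against preposition.dat, and Porter-stemmed
print(Cleaning())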
Example #21
 def __init__(self):
     self.stemmer = PorterStemmer()
     self.p = re.compile(r"&.{1,5}?;|[!-@[-`{-~]")
     for file in glob.glob(os.path.dirname(__file__) + "/stopwords/*/*.txt"):
         self.stopwords += [line.strip() for line in open(file).readlines()]
     self.stopwords.append("the")
Example #22
#!/usr/bin/env python
#encoding=utf8

from porterStemmer import PorterStemmer

if __name__ == "__main__":
    stemmer = PorterStemmer()
    word = "Keyphrases"
    result = stemmer.stem(word, 0, len(word)-1)
    print(result)
Example #23
def applyStem(word):
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
Example #24
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents = re.compile('<[^<>]+>').sub(' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\\s]*').sub(
        ' httpaddr ', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\\s]+@[^\\s]+').sub(' emailaddr ',
                                                       email_contents)

    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)

    # Other
    email_contents = re.split('[ @$/#.-:&*+=\\[\\]?!(){},'
                              '\">_<;%\\n\\r]', email_contents)
    email_contents = [word for word in email_contents if len(word) > 0]

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Process file
    stemmer = PorterStemmer()
    processed_email = []
    for word in email_contents:
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        # Skip the word if it is too short
        if len(word) < 1:
            continue
        processed_email.append(word)
        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable str. You should look up str in the
        #               vocabulary list (vocabList). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if str = 'action', then you should
        #               look up the vocabulary list to find where in vocabList
        #               'action' appears. For example, if vocabList{18} =
        #               'action', then, you should add 18 to the word_indices
        #               vector (e.g., word_indices = [word_indices ; 18]; ).
        #
        # Note: vocabList{idx} returns the word with index idx in the
        #       vocabulary list.
        #
        # Note: You can use strcmp(str1, str2) to compare two strings (str1 and
        #       str2). It will return 1 only if the two strings are equivalent.
        #
        try:
            index = vocabList.index(word)
        except ValueError:
            pass
        else:
            word_indices.append(index)
        # ============================================================"
    print(' '.join(processed_email))
    # Print footer
    print('\n\n=========================')
    return word_indices
Example #25
def stemWords(tokens):
    stemmer = PorterStemmer()  # construct once, outside the loop
    for i in range(len(tokens)):
        tokens[i] = stemmer.stem(tokens[i], 0, len(tokens[i]) - 1)
    return tokens
Example #26
        for words in temp:
            temp_string += " " + words

        punc_type_1 = [",","-","=","/","\\","'",";","^","+","|",":","<",">","`","&","(",")"]
        punc_type_2 = [".",'"',"[","]","?","!","*","%","{","}","$"]

        for punc in punc_type_1:
            if punc in temp_string:
                temp_string = temp_string.replace(punc, " ")
        for punc in punc_type_2:
            if punc in temp_string:
                temp_string = temp_string.replace(punc, "")

        temp_string = temp_string.split()
        final_word_list = [x for x in temp_string if x not in stop_words]
        p = PorterStemmer()
        mid_list = [p.stem(word, 0, len(word) - 1) for word in final_word_list]
        new_list = [x for x in mid_list if x not in stop_words]
        final_string = ''.join(" " + x for x in new_list)
        query_hashmap[key] = final_string.strip()
        #print query_hashmap[key] # printing each query after stopping and stemming
    
    model_dict = {}
    query_word_count = defaultdict(float)
    file1 = open ("file1.txt","r").readlines()
    
    for key in query_hashmap.keys():
        query = query_hashmap[key].split()
        print(query)
        query = list(map(str.lower, query))
        query_dict = {}
Example #27
 def __init__(self):
     self.remove_punctuation_set = set('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~')
     self.stemmer = PorterStemmer()
     self.stopWordsList = []
     self.loadStopWords()
Example #28
import sys
import re
from functools import reduce  # intersectLists relies on reduce (Python 3)
from porterStemmer import PorterStemmer
from collections import defaultdict
import copy

porter = PorterStemmer()


class QueryIndex:
    def __init__(self):
        self.index = {}
        self.titleIndex = {}
        self.tf = {}  #term frequencies
        self.idf = {}  #inverse document frequencies

    def intersectLists(self, lists):
        if len(lists) == 0:
            return []
        #start intersecting from the smaller list
        lists.sort(key=len)
        return list(reduce(lambda x, y: set(x) & set(y), lists))

    def getStopwords(self):
        f = open(self.stopwordsFile, 'r')
        stopwords = [line.rstrip() for line in f]
        self.sw = dict.fromkeys(stopwords)
        f.close()

    def getTerms(self, line):
        line = line.lower()
Example #29
def parseDocs():
    global terms
    global documents
    files = os.listdir("./cacm")
    sp = open("stopWords.txt")
    stopData = sp.read()
    stopTerms = stopData.lower().split("\n")
    stopTerms = stopTerms[:len(stopTerms) - 1]
    
    filep1 = open("terms.txt", "a")
    filep2 = open("mappings.txt", "a")
    filep3 = open("documents.txt", "a")
    
    termId = 1 
    
    
    for f in files:
        fp = open("./cacm/" + f)
        documentName = f.split(".")[0]
        
        documentId = documentName.split("-")[1]
        line = fp.read()
        data = re.compile(r'.*?<pre>(.*?)</pre>', re.DOTALL).match(line).group(1)
        data = data.replace("\n", " ")
        splitword = re.compile(r'CA\d+', re.DOTALL).findall(data)[0]
        text = data.split(splitword)
        words = text[0]
        words = words.replace("CACM", " ")
        words = words.lower()
        words = re.compile(r'(\w+)', re.DOTALL).findall(words)
        
        stemmer = PorterStemmer()
        # materialize as a list: it is measured, counted, and iterated below
        words = [stemmer.stem(word, 0, len(word) - 1) for word in words]
        docLength = len(words)
        
        global totalDocLength
        totalDocLength += docLength
        
        count = collections.Counter(words)   
        
        filep3.write(documentId + " " + documentName + " " + str(docLength) + "\n")

        for term in words:
            if term not in stopTerms and term not in stopList:
                
                global numOfTerms
                numOfTerms += 1
                
                if term in terms:
                    #print(term)
                    attributes = terms[term]
                    #print(attributes)
                    idterm = attributes[0]
                    tf = count[term]
                    documentDetails = attributes[3]
                    latestDoc = len(documentDetails)
                    lastTermId = documentDetails[latestDoc - 1]
                    #print(latestDoc)
                    if documentId == lastTermId[0]:
                        ctf = attributes[1]
                        ctf = ctf + 1
                        df = attributes[2]
                        terms[term] = idterm, ctf, df, documents[term]
                        #print(terms[term])               
                    else:
                        documents[term] = documents[term] + [[documentId, documentName, docLength, tf]]     
                        ctf = attributes[1]
                        ctf = ctf + 1
                        df = attributes[2]
                        df = df + 1
                        terms[term] = idterm, ctf, df, documents[term]
                        #print(terms[term])               
                
                if term not in terms:
                    #print(termId)
                    ctf = 1
                    tf = count[term]
                    df = 1
                    documents[term] = [[documentId, documentName, docLength, tf]]
                    terms[term] = termId, ctf, df, documents[term]
                    
                    termId += 1

    for key in terms:
        
        attributes = terms[key]
        key_termName = key
        key_termId = attributes[0]
        key_ctf = attributes[1]
        key_df = attributes[2]
        key_documents = attributes[3]
        
        offsetLength = len(str(key_termId)) + 1
        
        filep2.write(str(key_termId) + " ")
        
        for doc in key_documents:
            docId = doc[0]
            tf = doc[3]
            offsetLength += len(docId) + len(str(tf)) + 2  
            filep2.write(docId + " " + str(tf) + " ")
            
        
        filep2.write("\n")
        
        global offset
        filep1.write(key_termName + " " + str(key_termId) + " " + str(key_ctf) + " " + str(key_df) + " " + str(offset) + " " + str(offsetLength) + "\n")
        offset += offsetLength + 1
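For reference, the on-disk layout these loops produce, read off the write() calls above (a sketch, not project documentation):

# terms.txt     : term termId ctf df offset offsetLength   (one line per term)
# mappings.txt  : termId docId tf docId tf ...              (one line per term)
# documents.txt : docId docName docLength                    (one line per document)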