Example #1
def init_test(doc, word_list, i, docs):
    p = porter.PorterStemmer()
    infile = open(doc, 'r', encoding="ISO-8859-1")
    while 1:
        word = ''
        line = infile.readline()
        if line == '':
            break
        elif line == '\n':
            continue
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if c.isdigit():
                    continue
                elif c in string.punctuation:
                    continue
                elif word:
                    if not word in stopword:
                        word = p.stem(word, 0, len(word) - 1)
                        if word in word_list:
                            idx = word_list.index(word)
                            docs[i, idx] += 1
                        word = ''
                    else:
                        word = ''
                        continue
    infile.close()
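Most examples on this page call the classic three-argument porter.py API, where stem(p, i, j) stems the substring p[i..j] and returns the result. A minimal standalone sketch of that call pattern (assuming such a porter.py is importable):

import porter

p = porter.PorterStemmer()
for w in ["running", "flies", "classification"]:
    # stem the whole word: start index 0, end index len(w) - 1
    print(w, "->", p.stem(w, 0, len(w) - 1))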
Example #2
class Summarizer(object):
    """Abstract base class for all summarizers."""

    NUM_DOCS = 2
    BASE_DIR = os.path.dirname(__file__)
    stopwords = set( open(os.path.join(BASE_DIR, 'stop_words.txt'), 'r').read().strip().split(',') )
    word_re = re.compile(r'\w+')
    non_space = re.compile(r'\S+')
    non_alnum_ending = re.compile(r'\W$')  # drop trailing non-alphanumeric chars
    punctuation = re.compile(r'[\-.,?!:;\'()&\[\]\$]')
    sentence_terminator = re.compile(r'[.?!]$')
    numbers = re.compile(r'[_\d.]+') # numbers and other odd tokens made of digits, dots and underscores (alternative: r'[\d.]*\d+')

    stemmer = porter.PorterStemmer()

    def __init__(self):
        self.df = None
        self.tf = {}

    # tf is the frequency of a word in the document being summarized;
    # df is its frequency in the document collection (all 37 plays)
    def initialize(self, tf_filename, df_filename):
        with open(os.path.join(Summarizer.BASE_DIR, df_filename), 'rb') as f:
            self.df = pickle.load(f)

        with open(tf_filename, 'r') as f:
            for line in f:
                count, word = line.split()
                self.tf[word] = int(count)


    @abc.abstractmethod
    def summarize(self, document_path):
        return
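The comment above explains that tf is the within-document frequency and df the collection frequency. To show how the abstract interface is meant to be used, here is a hedged sketch of a toy subclass (ToySummarizer is hypothetical; it assumes initialize() has been called and that the pickled df loads as a plain dict):

class ToySummarizer(Summarizer):
    """Toy subclass: return the sentence with the highest tf/df score."""

    def summarize(self, document_path):
        with open(document_path, 'r') as f:
            sentences = [s.strip() for s in f.read().split('.') if s.strip()]

        def score(sentence):
            words = Summarizer.word_re.findall(sentence.lower())
            return sum(self.tf.get(w, 0) / (1.0 + self.df.get(w, 0))
                       for w in words if w not in Summarizer.stopwords)

        return max(sentences, key=score) if sentences else ''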
Example #3
    def __init__(self, all_path, stopwords_path):

        # path of the file to parse
        self.all = all_path
        # file name used to store the BM25 weights
        self.bm25_file = 'BM25Weights.json'
        # Porter stemmer used to stem words
        self.porter = porter.PorterStemmer()
        # stop words read from stopwords_path
        self.stopwords = self._read_stopwords(stopwords_path)
        # per-document data: docid -> {'len': doclen, 'tfs': {term: frequency}}
        self.docdict = {}
        # idfs: term -> overall frequency across the collection
        self.idfs = {}
        # BM25 scores: docid -> {term: score}
        self.BM25 = {}
        # the k parameter of the BM25 formula
        self.k = 1
        # the b parameter of the BM25 formula
        self.b = 0.75

        # check whether the BM25 weights file already exists:
        if os.path.exists(self.bm25_file):
            #load the BM25Weights
            print("Loading BM25 index from file, please wait. \n")
            self.load_bm25()
        else:
            # calculate the BM25 scores and store them
            print("BM25 index does not exist")
            print("Generating BM25 index....")
            self.calculate_bm25()
            self.save_bm25()
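The constructor above refers to load_bm25() and save_bm25(), which are not shown. A hedged sketch of what a JSON round-trip for 'BM25Weights.json' might look like (hypothetical; the real project's methods may persist more than self.BM25):

import json

class BM25PersistenceSketch(object):
    """Hypothetical persistence methods matching the constructor above."""

    def save_bm25(self):
        with open(self.bm25_file, 'w') as f:
            json.dump(self.BM25, f)

    def load_bm25(self):
        with open(self.bm25_file, 'r') as f:
            self.BM25 = json.load(f)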
def getItemWords(list_of_words, stop_words):
    stemmer = porter.PorterStemmer()
    allwords = {}
    itemwords = []
    itemtitles = []
    ec = 0
    stemlist = {}
    # Loop over every item in list_of_words
    for item in list_of_words:
        words = separatewords(item, 1)
        words = removeStopWords(words, stop_words)
        itemwords.append({})
        itemtitles.append("Response " + str(ec + 1))
        # Increase the counts for this word in allwords and in itemwords
        for word in words:
            unstemmedword = word
            word = stemmer.stem(word, 0, len(word) - 1)
            if word in stemlist:
                temp = stemlist[word]
                try:
                    temp.index(unstemmedword)
                except ValueError:
                    temp.append(unstemmedword)
                    stemlist[word] = temp
            else:
                temp = []
                temp.append(unstemmedword)
                stemlist[word] = temp
            allwords.setdefault(word, 0)
            allwords[word] += 1
            itemwords[ec].setdefault(word, 0)
            itemwords[ec][word] += 1
        ec += 1
    return allwords, itemwords, itemtitles, stemlist
Example #5
def stem_terms(terms):
    stemmed_terms = []
    p = porter.PorterStemmer()
    for term in terms:
        stemmed_terms.append(stem_term(term, p))

    return stemmed_terms
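stem_terms() relies on a stem_term() helper that is not shown; a hypothetical one-liner consistent with the call site could be:

def stem_term(term, p):
    # hypothetical helper: stem the whole term with the three-argument API
    return p.stem(term, 0, len(term) - 1)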
Example #6
File: asd.py  Project: sriya29/ir_project
def normalize(word):
    regex = re.compile('[^a-zA-Z0-9]')
    word = regex.sub("", word)
    word = word.lower()
    stemmer = porter.PorterStemmer()
    word = stemmer.stem(word, 0, len(word) - 1)
    return word
Example #7
File: rtrvr.py  Project: mananraheja/PYPL
 def __init__(self,
              docset_map_file='',
              punctuation_marks='',
              stoplist_file=''):
     self.__docMap = text.PickleStrMap.load(docset_map_file)
     self.__txtprr = text.TextProcessor(
         punctuation_marks, text.PickleStrMap.load(stoplist_file),
         porter.PorterStemmer())
Example #8
def test_05():
    stp = text.PickleStrMap.load(my_test_dir + 'stoplist.pkl')
    fnfltr = fnfilter.TextFileFilter()
    ftdsmap = text.FileToDocSetMap()
    stm = porter.PorterStemmer()
    tp = text.TextProcessor(' `~!@#$%%^&*()_+{}|\[];\':";\',./?><', stp, stm)
    docset = tp.processFile(my_test_dir + 'poem.txt')
    print docset
Example #9
def test_03():
    stp = text.PickleStrMap.load(my_test_dir + 'stoplist.pkl')
    fnfltr = fnfilter.TextFileFilter()
    ftdsmap = text.FileToDocSetMap()
    stm = porter.PorterStemmer()
    tp = text.TextProcessor(' `~!@#$%%^&*()_+{}|\[];\':";\',./?><', stp, stm)
    crwlr = crawl.Crawler(fnfltr, ftdsmap, tp)
    crwlr.crawl(my_test_dir + 'texts/')
    crwlr.pickleFileToDocSetMap(my_test_dir + 'file_to_docset_map.pkl')
Example #10
def search(keystring):
    file = open('index.txt', encoding='gbk')
    js = file.read()
    dicread = json.loads(js)
    file.close()
    N = dicread['N']
    k = 1
    b = 0.75
    avg = dicread['avg_doclen']
    stemmer = porter.PorterStemmer()
    stopwords = set()
    with open('stopwords.txt', 'r') as f:
        for line in f:
            stopwords.add(line.rstrip())

    keystring = keystring.split(None)
    keylist = []
    for value in keystring:
        if value not in stopwords:
            value = stemmer.stem(value)
            value = value.lower()
            keylist.append(value)
    dic_ij = {}
    dic_ni = {}
    for term in keylist:
        dicij = {}
        i = 0
        for id in dicread['house_index_info']:
            if term in dicread['house_index_info'][id]:
                fij = dicread['house_index_info'][id][term]
                dicij[id] = fij
                i = i + 1
                dic_ni[term] = i
            else:
                fij = 0
                dicij[id] = fij
                dic_ni[term] = i
            dic_ij[term] = dicij
    bmij = {}
    for docid in dicread['house_index_info']:
        sim = 0
        for term in keylist:
            fij = dic_ij[term][docid]
            ni = dic_ni[term]
            len = dicread['indoclen'][docid]
            sim = sim + (fij * (1 + k) /
                         (fij + k * (1 - b + ((b * len) / avg)))) * math.log(
                             ((N - ni + 0.5) / (ni + 0.5)), 2)
        bmij[docid] = sim
    bmrank = sorted(bmij.items(), key=lambda x: x[1], reverse=True)[0:100]
    # dic_result = {}
    keylist = []
    for key in bmrank:
        keylist.append(key[0])
    return keylist
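For reference, the per-term contribution computed inside the loop above can be written as a standalone helper with the same k, b and base-2 logarithm:

import math

def bm25_term_score(fij, ni, doclen, avg, N, k=1, b=0.75):
    tf_part = fij * (1 + k) / (fij + k * (1 - b + (b * doclen) / avg))
    idf_part = math.log((N - ni + 0.5) / (ni + 0.5), 2)
    return tf_part * idf_part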
Example #11
 def getSynonym(self, word):
     synonyms = []
     p = porter.PorterStemmer()
     for synset in wn.synsets(word):
         for lemma in synset.lemmas():
             syn = lemma.name()
             if parameters.stemming:  # stem synonym if the stemming parameter is set to True
                 syn = p.stem(lemma.name(), 0, len(lemma.name()) - 1)
             if syn not in synonyms:
                 synonyms.append(syn)
     return synonyms
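A quick standalone check of the same idea, assuming NLTK's WordNet corpus is installed and a three-argument porter.py; stemming collapses inflected variants of the synonyms into single entries:

from nltk.corpus import wordnet as wn
import porter

p = porter.PorterStemmer()
synonyms = set()
for synset in wn.synsets("happy"):
    for lemma in synset.lemmas():
        name = lemma.name()
        synonyms.add(p.stem(name, 0, len(name) - 1))
print(sorted(synonyms))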
Example #12
File: spread.py  Project: yep/research
def stem(li,cols=0):
  if cols == 0: cols = range(len(li[0]))
  import porter
  pstemmer = porter.PorterStemmer()
  newlist = copy.deepcopy(li)
  for i in range(len(li)):
    for j in cols:
      string = str(li[i][j])
      for ch in "'"+'"+[]?!\n': string = string.replace(ch,'')
      words = string.split(' ')
      newlist[i][j] = ' '.join([pstemmer.stem(x.strip().lower(),0,len(x.strip())-1) for x in words])
  return newlist
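Usage sketch for the stem() helper above (the surrounding module is assumed to import copy); only the listed columns are stemmed:

rows = [["The cats were running", "doc-1"],
        ["A cat runs quickly", "doc-2"]]
print(stem(rows, cols=[0]))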
Example #13
def porterTokenizer(corpusString):
    p = porter.PorterStemmer()
    output = ''
    word = ''
    lines = corpusString.split('\n')
    for line in lines:
        for c in line:
            if c.isalpha():
                word += c.lower()
            else:
                if word:
                    output += p.stem(word, 0, len(word) - 1)
                    word = ''
                output += c.lower()
        # flush a word left at the end of a line: splitting on '\n' removed the
        # newline that would otherwise terminate it
        if word:
            output += ' ' + p.stem(word, 0, len(word) - 1)
            word = ''
    return output.split()
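Usage sketch for porterTokenizer(); non-letter characters are lower-cased and kept, so punctuation ends up attached to the preceding stem in the split output:

tokens = porterTokenizer("The cats were running.\nDogs bark loudly!")
print(tokens)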
def Q2a():
    p = porter.PorterStemmer()
    text = nltk.load('text.txt', encoding='gbk')  # code for Q2a
    token_list = nltk.word_tokenize(text)
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''", "-"
    ]

    token_list = [
        word for word in token_list if word not in english_punctuations
    ]
    token_list1 = [w.lower() for w in token_list]
    print(token_list1)
    token_list2 = [p.stem(w) for w in token_list1]
    print(token_list2)
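Note that Q2a() calls p.stem(w) with a single argument, whereas Example #1 uses the three-argument form p.stem(w, 0, len(w) - 1); the porter.py variants in circulation differ on this. A small compatibility wrapper (a sketch, not part of any project above) can hide the difference:

def stem_word(p, w):
    """Stem w with either porter.py variant (three-argument or single-argument stem())."""
    try:
        return p.stem(w, 0, len(w) - 1)  # classic three-argument API
    except TypeError:
        return p.stem(w)                 # variant that takes just the word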
Example #15
File: ds2_dev.py  Project: roeiba/WikiRep
def compare(doc1, doc2):
    """ strip all punctuation but - and '
        convert to lower case
        store word/occurance in dict
    """

    words_db = WordsDb(stemmer=porter.PorterStemmer())

    for text in [doc1, doc2]:
        words_db.add_article(text)

    words_db.bulid()

    v1 = words_db.classify(doc1)
    v2 = words_db.classify(doc2)

    print v1
    print v2

    return float(dot(v1, v2) / (norm(v1) * norm(v2)))
def Q3():
    p = porter.PorterStemmer()
    stopwords = []
    with open('stopwords.txt', 'r') as f:
        for line in f:
            stopwords.append(line.rstrip())
        f.close()
    # print(stopwords)
    temp = requests.get("https://www.bbc.com/news/world-us-canada-49871909")
    temp.encoding = 'utf-8'
    soup = BeautifulSoup(temp.content, 'html.parser')
    text_1 = soup.find('div', {'class': 'story-body__inner'}).findAll('p')
    # text_1.remove('<p>')
    text_1 = [part.get_text() for part in text_1]
    text_1 = [nltk.word_tokenize(sen) for sen in text_1]
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '"', '\'s', '``', "''", "-"
    ]
    text_1 = [[word for word in sens if word not in english_punctuations]
              for sens in text_1]
    text_1 = [[word for word in sens if word not in stopwords]
              for sens in text_1]
    text_1 = [nltk.pos_tag(sen) for sen in text_1]
    # print(text_1)

    result = []

    for sen in text_1:
        for word in sen:
            if "V" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0].lower(), 'v')
            elif "N" in word[1]:
                w = WordNetLemmatizer().lemmatize(word[0], 'n')
            else:
                w = p.stem(word[0])
            result.append(w.lower())
    # print(result)
    fdist = FreqDist(result)
    tops = fdist.most_common(40)
    print(tops)
def preProcessing(content):
	stopSet = getStopSet()
	p = porter.PorterStemmer()
	info = []
	for line in content:
		newLine = ""
		line = line.split(" ")
		for element in line:
			temp = element.split("://")
			temp1 = element.split("@")
			temp2 = element.split("#")
			temp3 = element.split("/")
			if len(temp)<2 and len(temp1)<2 and len(temp2)<2 and len(temp3)<2:
				element = element.strip()#clean the '\n' 
				element = element.lower()
				element = element.translate(str.maketrans('','', string.punctuation))
				element = p.stem(element)
				if element not in stopSet:
					newLine = newLine+element+" "
		info.append(newLine)
	return info
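Usage sketch for preProcessing(); getStopSet() is assumed to behave as in the original project (returning a set of stop words), and tokens containing '://', '@', '#' or '/' (URLs, mentions, hashtags, paths) are dropped:

lines = ["Check http://example.com for the details\n",
         "I am really enjoying these #lectures\n"]
print(preProcessing(lines))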
Example #18
 def Porter_extraction(self, file_path):
     p = porter.PorterStemmer()
     r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
     r_num = '[0-9]'
     r = re.compile(r)
     r_num = re.compile(r_num)
     text = ''
     with open(file_path, 'r') as infile:
         output = ''
         word = ''
         line = infile.read()
         for c in line:
             if c.isalpha():
                 word += c.lower()
             else:
                 if word:
                     output += p.stem(word, 0, len(word) - 1)
                     word = ''
                 output += c.lower()
         text += (' ' + re.sub(r, ' ', re.sub(r_num, ' ', output)))
     return text
def main():
    # stores count of each word appearing in a file at least once
    # does not double count word appearances in a single file

    ps = pstem.PorterStemmer()

    fileG = open("groundtruth.csv", "w")
    porter = False
    stopWord = None
    root_dir = sys.argv[1]
    if (sys.argv[2] == "TRUE"):
        porter = True
    if (sys.argv[3] != "NULL"):
        stopWord = sys.argv[3]

    if (stopWord != None):
        fileH = open(stopWord, "r")
        stopWord = []
        line = fileH.readlines()
        stopWord = [lines.rstrip('\n') for lines in line]
        fileH.close()

    test = [
        dI for dI in os.listdir(root_dir + "/C50test")
        if os.path.isdir(os.path.join(root_dir + "/C50test", dI))
    ]
    train = [
        dI for dI in os.listdir(root_dir + "/C50train")
        if os.path.isdir(os.path.join(root_dir + "/C50train", dI))
    ]

    totalDict = {}
    allVectors = []
    print("Reading test data...")
    for name in test:
        dir_name = root_dir + "/C50test/" + name
        files = os.listdir(dir_name)
        for file in files:
            fileG.write(file + "," + name + "\n")
            file_name = dir_name + "/" + file
            with open(file_name, 'r') as myfile:
                document = myfile.read().replace('\n', ' ')
                document = document.replace('.',
                                            ' ').replace(",", " ").replace(
                                                "\"", "").replace("-", " ")
                document = document.replace("-", "").replace("(", "").replace(
                    ")", "").replace("$", "")
                document = document.replace("?", "").replace("!", "").replace(
                    "#", "").replace("/", " ")
                document = re.sub('\d', '', document)
                document = document.lower().split()
                hitWords = []
                thisDict = {}
                for word in document:
                    if (stopWord != None and word in stopWord):
                        continue
                    word = word.replace("'", "")
                    if (word == ""):
                        continue
                    if (porter == True):
                        word = ps.stem(word, 0, len(word) - 1)
                    if (word not in hitWords):
                        if word in totalDict:
                            totalDict[word] += 1
                        else:
                            totalDict[word] = 1
                        hitWords.append(word)
                    if word not in thisDict:
                        thisDict[word] = 1
                    else:
                        thisDict[word] += 1
                temp = Vector(file, name)
                temp.vector = thisDict
                allVectors.append(temp)

    print("Reading train data...")
    for name in train:
        dir_name = root_dir + "/C50train/" + name
        files = os.listdir(dir_name)
        for file in files:
            fileG.write(file + "," + name + "\n")
            file_name = dir_name + "/" + file
            with open(file_name, 'r') as myfile:
                document = myfile.read().replace('\n', ' ')
                document = document.replace('.',
                                            ' ').replace(",", " ").replace(
                                                "\"", "").replace("-", " ")
                document = document.replace("-", "").replace("(", "").replace(
                    ")", "").replace("$", "")
                document = document.replace("?", "").replace("!", "").replace(
                    "#", "").replace("/", " ")
                document = re.sub('\d', '', document)
                document = document.lower().split()
                hitWords = []
                thisDict = {}
                for word in document:
                    if (stopWord != None and word in stopWord):
                        continue
                    word = word.replace("'", "")
                    if (word == ""):
                        continue
                    if (porter == True):
                        word = ps.stem(word, 0, len(word) - 1)
                    if (word not in hitWords):
                        if word in totalDict:
                            totalDict[word] += 1
                        else:
                            totalDict[word] = 1
                        hitWords.append(word)
                    if word not in thisDict:
                        thisDict[word] = 1
                    else:
                        thisDict[word] += 1
                temp = Vector(file, name)
                temp.vector = thisDict
                allVectors.append(temp)

    print(len(allVectors))
    fileNoNorm = open("plainVector.csv", "w")
    for vect in allVectors:
        fileNoNorm.write(vect.authorName + "," + vect.fileName + ",")
        vectDict = vect.vector
        for elem in vectDict:
            fileNoNorm.write(elem + " " + str(vectDict[elem]) + ",")
        fileNoNorm.write("\n")
    fileNoNorm.close()

    tfidfVect = copy.deepcopy(allVectors)
    for thing in tfidfVect:
        curDict = thing.vector
        maximum = max(curDict.values())
        for k in curDict:
            curDict[k] /= maximum
            curDict[k] = curDict[k] * math.log(5000 / totalDict[k], 2)

    fileNoNorm = open("tfidfVector.csv", "w")
    for vect in tfidfVect:
        fileNoNorm.write(vect.authorName + "," + vect.fileName + ",")
        vectDict = vect.vector
        for elem in vectDict:
            fileNoNorm.write(elem + " " + str(vectDict[elem]) + ",")
        fileNoNorm.write("\n")
    fileNoNorm.close()

    fileG.close()
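The weighting applied to tfidfVect above, restated as a helper: the term frequency is max-normalized within each document and multiplied by a base-2 idf, with the collection size hard-coded to 5000 documents:

import math

def tfidf_weight(tf, max_tf, df, total_docs=5000):
    # tf / max_tf is the max-normalized term frequency;
    # log2(total_docs / df) is the idf factor used above
    return (tf / max_tf) * math.log(total_docs / df, 2)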
Example #20
def test_02():
    stp = text.PickleStrMap.load(my_test_dir + 'stoplist.pkl')
    stm = porter.PorterStemmer()
    tp = text.TextProcessor(' `~!@#$%%^&*()_+{}|\[];\':";\',./?><', stp, stm)
    docset = tp.processFile(my_test_dir + 'poem.txt')
    print docset
Example #21
    def __init__(self):

        # the BM25 model
        self.model = {}
        # the Porter stemmer used to stem the input
        self.porter = porter.PorterStemmer()
Example #22
class FeatureSet(object):
    """ Set of features for a dictionary. """

    splitter = re.compile( "[a-z0-9]+(?:['\-][a-z0-9]+)*", re.I )
    dates = re.compile( r'\b\d\d\d\d\b|\'\d\d\b' )
    numbers = re.compile( r"\d+" )
    apos = re.compile("'$")
    stemmer = porter.PorterStemmer()
    words = None
    stop_words = None
    st = None
    wt = None
    pronouns = None
    fset = None
    
    def __init__(self, name="", features=None):
        """ Initializes a feature set. """

        # Load various libraries / dictionaries if they haven't been
        if FeatureSet.pronouns is None:
            FeatureSet.pronouns = loadDictionary(PRONOUN_FILENAME)
        if FeatureSet.words is None:
            FeatureSet.words = loadDictionary(DICT_FILENAME)
        if FeatureSet.stop_words is None:
            FeatureSet.stop_words = loadDictionary(STOP_FILENAME)
        if FeatureSet.st is None:
            # FeatureSet.st = punkt.PunktSentenceTokenizer(gutenberg.raw(gutenberg.files()))
            FeatureSet.st = punkt.PunktSentenceTokenizer()
        if FeatureSet.wt is None:
            FeatureSet.wt = punkt.PunktWordTokenizer()

        # predefined set of features?
        if features is None:
            self.features = {}
        else:
            self.features = features
            
        # article name
        self.name = name

    def getFeatures(self):
        return self.features
    def getFeature(self, f):
        if f in self.features:
            return self.features[f]
        return 0

    def incrFeature(self,f):
        if f in self.features:
            self.features[f] += 1
        else:
            self.features[f] = 1
    def setFeature(self,f,val):
        self.features[f] = val

    def __iter__(self):
        return self.features.__iter__()
    def keys(self):
        return self.features.keys()
    def __eq__(self, other):
        for i in self.features:
            if i == "SIMS":
                continue
            if i in other.features:
                if other.features[i] != self.features[i]:
                    return False
            else:
                if self.features[i] != 0:
                    return False
        return True

    def __add__(self,other):
        f = self.features.copy()
        for i in other.features:
            if i in self.features:
                f[i] = self.features[i] + other.features[i]
            else:
                f[i] = other.features[i]
        return FeatureSet(features=f)

    def __radd__(self,other):
        for i in other.features:
            if i in self.features:
                self.features[i] += other.features[i]
            else:
                self.features[i] = other.features[i]
        return self

    def __mul__(self,other):
        dotprod = 0
        if type(other) == dict:
            fs = other
        else:
            fs = other.features
        for i in fs:
            if i in self.features:
                dotprod += self.features[i] * fs[i]

        return dotprod    
        
        
    def extractFeatures(self, text):
        """ Extracts features based on text.  Clears any existing features."""

        # Working text (will have things deleted)
        wtext = text

        # Clear dict before importing new features
        self.features = {}

        # Clean text
        words = FeatureSet.splitter.findall(text)

        # Number of words, sentences, questions, exclamations
        self.features["WORD"] = len(words)
        self.features["SENT"] = len(FeatureSet.st.tokenize(text))
        self.features["QUES"] = text.count("?")
        self.features["EXCL"] = text.count("!")

        # If we have an article name provided, find instances of that
        if (self.name != ""):
            occurs = 0
            namesplit = FeatureSet.splitter.findall(self.name)
            for i in namesplit:
                namepart = re.compile(r'\b'+i+r'\b',re.I)
                occurs += len(namepart.findall(text))
                wtext = namepart.sub("", wtext)
            self.features["NAME"] = occurs
            

        # Find dates
        self.features["DATE"] = len(FeatureSet.dates.findall(wtext))
        wtext = FeatureSet.dates.sub("", wtext)

        # Remove other numbers
        self.features["NUM"] = len(FeatureSet.numbers.findall(wtext))
        wtext = FeatureSet.numbers.sub("", wtext)

        # Now look for words / bigrams / positions
        pronouns = 0 # num pronouns
        propers = 0  # num proper nouns

        prev = "" # end marker
        i = -1.0
        length = len(wtext)
        wtext_words = FeatureSet.splitter.findall(wtext)
        for w in wtext_words:
            i += 1
            wl = w.lower()
            if wl in FeatureSet.pronouns:
                pronouns += 1
                continue
            if wl in FeatureSet.stop_words:
                # If this is a stop word, just ignore it
                continue
            if not wl in FeatureSet.words and wl != w:
                # Capital and not in word list, so assume it's a proper noun
                propers += 1
                continue

            ws = FeatureSet.stemmer.stem(wl,0,len(wl)-1)
            ws = FeatureSet.apos.sub("", ws)

            if FeatureSet.fset is None or "UNI_"+ws.upper() in FeatureSet.fset:
                self.incrFeature("UNI_"+ws.upper())
            if prev != "" and (FeatureSet.fset is None or "BI_"+prev.upper()+"_"+ws.upper() in FeatureSet.fset):
                self.incrFeature("BI_"+prev.upper()+"_"+ws.upper())
            if (not "POS_"+ws.upper() in self.features) and (FeatureSet.fset is None or "POS_"+ws.upper() in FeatureSet.fset):
                self.features["POS_"+ws.upper()] = i/length

            prev = ws

        firstword = FeatureSet.stemmer.stem(words[0],0,len(words[0])-1).upper()
        if FeatureSet.fset is None or "FIRST_"+firstword in FeatureSet.fset:
            self.features["FIRST_" + firstword] = 1
       
        if len(words) > 1:
            secondword = FeatureSet.stemmer.stem(words[1],0,len(words[1])-1).upper()
            if FeatureSet.fset is None or "SECOND_"+firstword+"_"+secondword in FeatureSet.fset:
                self.features["SECOND_" + firstword + "_" + secondword] = 1

        self.features["PROP"] = propers
        self.features["PRON"] = pronouns
Example #23
def call_query(query, collection, i, brf, brf_count, brf_number_words,
               brf_from, stopwords, thesaurus, normalization):
    # clean query
    if parameters.case_folding:
        query = query.lower()
    query = re.sub(r'[^ a-zA-Z0-9]', ' ', query)
    query = re.sub(r'\s+', ' ', query)
    query_words = query.split(' ')

    if (thesaurus):
        query_with_thesaurus = []
        for word in query_words:
            query_with_thesaurus.append(word)
            syns = getSynonyms(word)
            for syn in syns:
                query_with_thesaurus.append(syn)
        query_words = query_with_thesaurus

    # create accumulators and other data structures
    accum = {}
    filenames = []
    p = porter.PorterStemmer()

    # get N
    f = open("indexes/" + collection + "_index_N", "r")
    N = eval(f.read())
    f.close()

    # get document lengths/titles
    titles = {}
    f = open("indexes/" + collection + "_index_len", "r")
    lengths = f.readlines()
    f.close()

    # get index for each term and calculate similarities using accumulators
    for term in query_words:
        if stopwords and (term in stop_words):
            continue
        if term != '':
            if parameters.stemming:
                term = p.stem(term, 0, len(term) - 1)
            if not os.path.isfile("indexes/" + collection + "_index/" + term):
                continue
            f = open("indexes/" + collection + "_index/" + term, "r")
            lines = f.readlines()
            idf = 1
            if parameters.use_idf:
                df = len(lines)
                idf = 1 / df
                if parameters.log_idf:
                    idf = math.log(1 + N / df)
            for line in lines:
                mo = re.match(r'([0-9]+)\:([0-9\.]+)', line)
                if mo:
                    file_id = mo.group(1)
                    tf = float(mo.group(2))
                    if not file_id in accum:
                        accum[file_id] = 0
                    if parameters.log_tf:
                        tf = (1 + math.log(tf))
                    accum[file_id] += (tf * idf)
            f.close()

    # parse lengths data and divide by |N| and get titles
    for l in lengths:
        mo = re.match(r'([0-9]+)\:([0-9\.]+)\:(.+)', l)
        if mo:
            document_id = mo.group(1)
            length = eval(mo.group(2))
            title = mo.group(3)
            if document_id in accum:
                if normalization:
                    accum[document_id] = accum[document_id] / length
                titles[document_id] = title

    # print top ten results
    results = sorted(accum, key=accum.__getitem__, reverse=True)
    final_result = []
    #print(collection+" "+query)
    for c in range(min(len(results), 10)):
        #print ("{0:10.8f} {1:5} {2}".format (accum[result[c]], result[c], titles[result[c]]))
        final_result.append([accum[results[c]], results[c]])
    if (brf and brf_count == 0):
        total = 0
        for result in results:
            if total >= brf_from:
                break
            total += 1
            document = result
            #accumulation = result[0]
            f = open(
                "indexes/tf-idf/testbed" + str(i) + "_document_" +
                str(document) + "_tf-idf", "r")
            lines = f.readlines()
            f.close()
            c = 0
            d = 0
            word = ""
            while (c < brf_number_words and len(lines) > d):
                mo = lines[d].split(":")
                if (word == mo[1].replace("\n", "")
                        or mo[1].replace("\n", "") in stop_words):
                    d += 1
                    continue
                word = mo[1].replace("\n", "")
                query += " " + word
                d += 1
                c += 1
        final_result = call_query(query, collection, i, brf, brf_count + 1,
                                  brf_number_words, brf_from, stopwords,
                                  thesaurus, normalization)
    return final_result
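The accumulator update inside the term loop above, restated: each matching document's score grows by tf * idf, where tf is optionally dampened to 1 + log(tf) and, when use_idf is enabled, idf is either 1/df or log(1 + N/df) depending on the log_idf flag:

import math

def term_weight(tf, df, N, log_tf=True, log_idf=True):
    idf = math.log(1 + N / df) if log_idf else 1 / df
    if log_tf:
        tf = 1 + math.log(tf)
    return tf * idf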
Example #24
def main(q_id, collection_name, query_text):
    print(q_id + " " + collection_name + " " + query_text)
    MIN_RESULT_LENGTH = 30
    OUT_DIR = "testbed/"
    RESULT_FILE = "control_results.txt"
    if parameters.use_thesaurus:
        RESULT_FILE = "thesaurus_results.txt"

    # construct collection and query
    query_id = q_id
    collection = collection_name
    query = query_text

    # clean query
    if parameters.case_folding:
        query = query.lower()
    query = re.sub(r'[^ a-zA-Z0-9]', ' ', query)
    query = re.sub(r'\s+', ' ', query)
    query_words = query.split(' ')

    # Check if using thesaurus.
    # The design is to get synonyms for each query term, excluding synonyms
    # that are longer than one word, since breaking a multi-word synonym into
    # its constituent words might not make sense as a synonym,
    # particularly because the system searches on a term basis and not a phrase basis.

    if parameters.use_thesaurus:
        added_synonyms = []
        for term in query_words:
            thesaurus = py_thesaurus.WordAnalyzer(term)
            synonyms = thesaurus.get_synonym()
            # ignore synonyms that are more than one word long
            allowed_synonyms = []
            for s in synonyms:
                if (len(s.split(" ")) == 1):
                    allowed_synonyms.append(s)
            for s in allowed_synonyms:
                if s not in added_synonyms:
                    added_synonyms.append(s)
        query_words.extend(added_synonyms)  # list of synonyms for a word

    # create accumulators and other data structures
    accum = {}
    filenames = []
    p = porter.PorterStemmer()

    # get N
    f = open(collection + "_index_N", "r")
    N = eval(f.read())
    f.close()

    # get document lengths/titles
    titles = {}
    f = open(collection + "_index_len", "r")
    lengths = f.readlines()
    f.close()

    # get index for each term and calculate similarities using accumulators
    for term in query_words:
        if term != '':
            if parameters.stemming:
                term = p.stem(term, 0, len(term) - 1)
            if not os.path.isfile(collection + "_index/" + term):
                continue
            f = open(collection + "_index/" + term, "r")
            lines = f.readlines()
            idf = 1
            if parameters.use_idf:
                df = len(lines)  # document frequency of a word
                idf = 1 / df
                if parameters.log_idf:
                    idf = math.log(1 + N / df)
            for line in lines:
                mo = re.match(r'([0-9]+)\:([0-9\.]+)', line)
                if mo:
                    file_id = mo.group(1)
                    tf = float(mo.group(2))
                    if not file_id in accum:
                        accum[file_id] = 0
                    if parameters.log_tf:
                        tf = (1 + math.log(tf))
                    accum[file_id] += (tf * idf)
            f.close()

    # parse lengths data and divide by |N| and get titles
    for l in lengths:
        mo = re.match(r'([0-9]+)\:([0-9\.]+)\:(.+)', l)
        if mo:
            document_id = mo.group(1)
            length = eval(mo.group(2))
            title = mo.group(3)
            if document_id in accum:
                if parameters.normalization:
                    accum[document_id] = accum[document_id] / length
                titles[document_id] = title

    # print top ten results
    result = sorted(accum, key=accum.__getitem__, reverse=True)
    for i in range(min(len(result), MIN_RESULT_LENGTH)):
        print("{0:10.8f} {1:5} {2}".format(accum[result[i]], result[i],
                                           titles[result[i]]))

    def write_to_result_file(result, query_id):
        run_id = "control"
        output = OUT_DIR + RESULT_FILE
        if parameters.use_thesaurus:
            run_id = "thesaurus"
        if not os.path.isdir(OUT_DIR):
            os.mkdir("testbed")
            output = "testbed/" + RESULT_FILE
            print("Writing results to: " + output)
        else:
            print("Writing results to: " + output)
        with open(output, "a") as f:
            for i in range(min(len(result), MIN_RESULT_LENGTH)):
                # <query-id> <literal '0'> <document-id> <rank> <score> <run-id>
                f.write(
                    str(query_id) + " 0 " + str(result[i]) + " " + str(i) +
                    " " + str(accum[result[i]]) + " " + run_id + "\n")
        f.close()

    write_to_result_file(result, query_id)
Example #25
 def __init__(self):
     LanguageModule.__init__(self)
     import porter
     self._stemmer = porter.PorterStemmer()
Example #26
class Compare:
    score = 0
    doc1 = ''
    doc2 = ''
    __splitter = re.compile("[a-zA-Z\-']+", re.I)
    __stemmer = porter.PorterStemmer()

    def __del__(self):
        class_name = self.__class__.__name__
        print class_name, "destroyed"

    def setDoc1(self, doc1):
        self.doc1 = doc1

    def setDoc2(self, doc2):
        self.doc2 = doc2

    def add_word(self, word, d):
        """
       Adds a word the a dictionary for words/count
       first checks for stop words
       the converts word to stemmed version
     """
        w = word.lower()
        # if w not in stop_words:
        # ws=stemmer.stem(w,0,len(w)-1)
        ws = w
        d.setdefault(ws, 0)
        d[ws] += 1

    def doc_vec(self, doc, key_idx):
        v = zeros(len(key_idx))
        for word in self.__splitter.findall(doc):
            # keydata=key_idx.get(stemmer.stem(word,0,len(word)-1).lower(), None)
            keydata = key_idx.get(word.lower(), None)
            # if keydata: v[keydata[0]] = 1
            if keydata: v[keydata[0]] += 1
        return v

    def compare(self):
        # strip all punctuation but - and '
        # convert to lower case
        # store word/occurrence in dict
        all_words = dict()

        for dat in [self.doc1, self.doc2]:
            [self.add_word(w, all_words) for w in self.__splitter.findall(dat)]

        # build an index of keys so that we know the word positions for the vector
        key_idx = dict()  # key-> ( position, count )
        keys = all_words.keys()
        keys.sort()
        for i in range(len(keys)):
            key_idx[keys[i]] = (i, all_words[keys[i]])
        del keys
        del all_words

        v1 = self.doc_vec(self.doc1, key_idx)
        v2 = self.doc_vec(self.doc2, key_idx)
        # return math.acos(float(dot(v1,v2) / (norm(v1) * norm(v2))))
        # return math.acos(float(dot(v1,v2) / (norm(v1) * norm(v2))))
        try:
            degreeScore = math.degrees(
                math.acos(float(dot(v1, v2) / (norm(v1) * norm(v2)))))
        except:
            degreeScore = 0
        return degreeScore
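The score returned by compare() is the angle, in degrees, between the two raw term-count vectors: 0 for documents pointing in the same direction, 90 for documents with no shared terms. As a standalone helper (assuming dot and norm come from numpy, as in the class):

import math
from numpy import dot
from numpy.linalg import norm

def angle_degrees(v1, v2):
    return math.degrees(math.acos(float(dot(v1, v2) / (norm(v1) * norm(v2)))))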
Example #27
def index(folder_name, i):
    print('Indexing Testbed ' + str(i))
    # Make index directories
    try:
        os.makedirs("indexes/testbed" + str(i) + "_index")
    except:
        pass
    try:
        os.makedirs("indexes/tf-idf/")
    except:
        pass
    data = {}
    print("indexing testbed" + str(i), end="")
    #read in files
    for j in range(1, 201):
        document = ''
        f = open(folder_name + "/document." + str(j),
                 "r",
                 encoding="ISO-8859-1")
        if parameters.case_folding:
            for line in f.readlines():
                document += line.lower() + " "
        else:
            for line in f.readlines():
                document += line + " "
        if (document != ''):
            data[str(j)] = document
        f.close()

    # document length/title file
    g = open("indexes/" + "testbed" + str(i) + "_index_len", "w")

    # create inverted files in memory and save titles/N to file
    index = {}
    N = len(data.keys())
    p = porter.PorterStemmer()
    for key in data:
        #write over dtf
        tf_idf = open(
            "indexes/tf-idf/" + "testbed" + str(i) + "_document_" + str(key) +
            "_tf-idf", "w")
        tf_idf.write("")
        tf_idf.close()

        content = re.sub(r'[^ a-zA-Z0-9]', ' ', data[key])
        content = re.sub(r'\s+', ' ', content)
        words = content.split(' ')
        doc_length = 0
        for word in words:
            if word != '':
                if parameters.stemming:
                    word = p.stem(word, 0, len(word) - 1)
                doc_length += 1
                if not word in index:
                    index[word] = {key: 1}
                else:
                    if not key in index[word]:
                        index[word][key] = 1
                    else:
                        index[word][key] += 1
        print(key, doc_length, key, sep=':', file=g)

    # document length/title file
    g.close()

    tf_idf_arr = {}  # dict of tf-idf scores

    for key in index:
        if (len(key) > 30):
            continue
        f = open("indexes/testbed" + str(i) + "_index/" + key, "w")
        for entry in index[key]:
            if (not (entry in tf_idf_arr)):
                tf_idf_arr[entry] = []
            # additionally calculate the tf-idf for use in Blind relevance feedback
            print(entry, index[key][entry], sep=':', file=f)
            tf = float(index[key][entry])
            idf = 1
            if parameters.use_idf:
                df = len(index[key])
                idf = 1 / df
                if parameters.log_idf:
                    idf = math.log(1 + N / df)
            tf_idf_arr[entry].append([tf * idf, key])
        f.close()
    # write N
    f = open("indexes/testbed" + str(i) + "_index_N", "w")
    print(N, file=f)
    f.close()
    # sort on tf_idf
    for j in tf_idf_arr:
        tf_idf_arr[j].sort(key=lambda k: (k[0], k[1]), reverse=True)
        # Write tf-idf to file
        tf_idf = open(
            "indexes/tf-idf/" + "testbed" + str(i) + "_document_" + str(j) +
            "_tf-idf", "w")
        for line in tf_idf_arr[j]:
            print(j)
            print(line[0], line[1], sep=':', file=tf_idf)
        tf_idf.close()
    print('Indexing Testbed ' + str(i) + ' Done')
Example #28
 def __init__(self):
     self.stemmer = porter.PorterStemmer()
Example #29
linenum = 0
tweet = []
sent = []

#for row in reader:
#	sent.append(row[0])
#	tweet.append(row[1])

#print '[log]-Done reading csv'

for line in f:
    tweet.append(line.strip())
input_file.close()

#use porter to stem english words 
stemmed_tweets = []
p = porter.PorterStemmer()
for t in tweet:
    t_n = re.sub(url+'|'+username,'', t, flags=re.MULTILINE)
    t_n = t_n.strip()
    tweet_list = t_n.split()
    s = ''
    for w in tweet_list:
        s_w = p.stem(w, 0, len(w) - 1)
        s = s + ' ' + s_w
    
    s = s.strip()
    stemmed_tweets.append(s)

print '[log]- stemming done'

pickle.dump(stemmed_tweets,open('prep_tweets.p','wb'))
Example #30
File: ir_tools.py  Project: xtype0x/ir_hw
def stemword(word):
	p = porter.PorterStemmer()
	return p.stem(word,0,len(word)-1)
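Usage sketch for stemword():

print(stemword("stemming"))  # expected: "stem"
print(stemword("running"))   # expected: "run"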