Example #1
    def __init__(self, parent, docno, doc, terms):
        QtGui.QDialog.__init__(self, parent)

        self.setupUi(self)

        # Set fields
        self.labelDocumentNo.setText(docno)

        textDocument = self.textEdit.document()
        textCursor = QtGui.QTextCursor(textDocument)

        normalFormat = QtGui.QTextCharFormat()
        termFormat = QtGui.QTextCharFormat()
        termFormat.setForeground(QtGui.QBrush(QtGui.QColor("red")))
        termFormat.setFontWeight(QtGui.QFont.Bold)

        textCursor.beginEditBlock()

        stemmer = PorterStemmer()
        terms = terms.split(",")
        stemmed_terms = [stemmer.stem(term, 0, len(term)-1) for term in terms]

        for line in unicode(doc).split("\n"):
            for word in line.split(" "):
                nword = word.lower().strip(punctuation)
                sword = stemmer.stem(nword, 0, len(nword)-1)
                if nword in terms or sword in stemmed_terms:
                    textCursor.insertText(word, termFormat)
                else:
                    textCursor.insertText(word, normalFormat)
                textCursor.insertText(" ", normalFormat)

            textCursor.insertText("\n", normalFormat)

        self.textEdit.moveCursor(QtGui.QTextCursor.Start)
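
Nearly every example on this page calls stem(word, 0, len(word) - 1). That three-argument signature comes from the classic porter.py reference port, where stem(p, i, j) stems the substring p[i:j+1] and returns the stemmed string. Below is a minimal sketch of that call convention, assuming the reference port is importable as porter (the module name is an assumption and may differ in your project):

# Minimal sketch of the stem(word, 0, len(word) - 1) convention (not taken from
# any example above). Assumes the classic porter.py reference port is importable
# as `porter`; adjust the import to wherever PorterStemmer lives in your code.
from porter import PorterStemmer

p = PorterStemmer()
for word in ["running", "caresses", "ponies"]:
    # stem(p, i, j) stems p[i:j + 1], so 0 and len(word) - 1 cover the whole word
    print(p.stem(word, 0, len(word) - 1))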
Example #2
 def stemWord(self, fileName, preprocessedFileName=''):
     "Stemming word and write to temp file"
     p = PorterStemmer()
     print('Preprocessing...')
     print('Stemming words...')
     if len(preprocessedFileName) != 0:
         self.tempFileName = preprocessedFileName
     with open(self.tempFileName, 'w') as outputfile:
         with open(fileName, 'r') as file:
             while 1:
                 word = ''
                 line = file.readline()
                 if line == '':
                     break
                  # copy the first word (the category label) to the output unchanged
                 category = ''
                 for ch in line:
                     if ch == ' ':
                         if len(category) != 0:
                             outputfile.write(category + ' ')
                             break
                     else:
                         category += ch
                 # skip first word (category label)
                 for i in range(len(category) + 1, len(line)):
                     if line[i].isalpha():
                         word += line[i].lower()
                     else:
                         if word:
                             outputfile.write(p.stem(
                                 word, 0,
                                 len(word) - 1))
                             word = ''
                         outputfile.write(line[i].lower())
Example #3
def convert_keyboard_query():
    qry = raw_input("Type in your query:")
    words = qry.strip().split(' ')
    p = PorterStemmer()
    QUERY_WEIGHT = 2
    new_doc_vec = defaultdict(int)
    for word in words:
        word = word.strip()
        if re.search('[a-zA-Z]', word):
            word = word.lower()
            word = p.stem(word, 0, len(word) - 1)
            if word in new_doc_vec:
                new_doc_vec[word] += QUERY_WEIGHT
            elif word not in stoplist_hash and word in corp_freq_hash:
                new_doc_vec[word] = QUERY_WEIGHT
            else:
                continue

    new_vect = defaultdict(int)
    for key in new_doc_vec:
        new_vect[key] = new_doc_vec[key]
        if key in synonyms:
            sim_words_list = synonyms_list[synonyms[key]]
            for sim_word in sim_words_list:
                if sim_word not in stoplist_hash and re.search(
                        "[a-zA-Z]", sim_word):
                    if corp_freq_hash[sim_word] > 1:
                        new_vect[sim_word] = new_doc_vec[key]

    return new_vect
def getStemWords(query_line, stopwords):
    raw_data = query_line.replace(".", "").replace(",", "").replace('"', "").replace("\n", "").replace("-", " ") \
        .replace("(", "").replace(")", "").split(" ")

    for i in stopwords:
        while i in raw_data:
            raw_data.remove(i)

    stemmedArray = raw_data[:]  # copy so the unstemmed tokens in raw_data are preserved
    p = PorterStemmer()

    for i in range(1, len(stemmedArray)):
        while stemmedArray[i] != p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1):
            stemmedArray[i] = p.stem(stemmedArray[i], 0, len(stemmedArray[i]) - 1)

    return raw_data[0], raw_data[1:], stemmedArray[1:]
def process_word(token):
    token = token.lower()
    if constants.STEM is True:
        p = PorterStemmer()
        token = p.stem(token, 0,len(token)-1)                       
    
    return token
Example #6
class Parser:

	#A processor for removing the commoner morphological and inflexional endings from words in English
	stemmer=None

	stopwords=[]

	def __init__(self,):
		self.stemmer = PorterStemmer()

		#English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
		#self.stopwords = open('data/english.stop', 'r').read().split()


	def clean(self, string):
		""" remove any nasty grammar tokens from string """
		string = string.replace(".","")
		string = string.replace("\s+"," ")
		string = string.lower()
		return string
	

	def removeStopWords(self,list):
		""" Remove common words which have no search value """
		return [word for word in list if word not in self.stopwords ]


	def tokenise(self, string):
		""" break string up into tokens and stem words """
		string = self.clean(string)
		words = string.split(" ")
		
		return [self.stemmer.stem(word,0,len(word)-1) for word in words]
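
A hypothetical usage sketch for the Parser above (not part of the original example). Note that the stopword load is commented out in this variant, so removeStopWords is effectively a no-op here:

# Hypothetical usage of the Parser defined above; assumes PorterStemmer is
# already importable in the same module, as the example implies.
parser = Parser()
tokens = parser.tokenise("The quick brown foxes jumped over the lazy dogs.")
tokens = parser.removeStopWords(tokens)  # no-op here: self.stopwords is empty
print(tokens)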
Example #7
def Word_appear_count(text, type, Word_count_pubmed, Word_count_twitter, Word_count_all):
    text = remove_tag(text)
    word = text.split()
    p = PorterStemmer()
    for i in word:
        i = p.stem(i, 0, len(i) - 1)  # porter

        # pubmed
        if i not in Word_count_pubmed.keys():
            Word_count_pubmed[i] = 0
            if type == 'pubmed':
                Word_count_pubmed[i] += 1
        elif i in Word_count_pubmed.keys() and type == 'pubmed':
            Word_count_pubmed[i] += 1

        # twitter
        if i not in Word_count_twitter.keys():
            Word_count_twitter[i] = 0
            if type == 'twitter':
                Word_count_twitter[i] += 1
        elif i in Word_count_twitter.keys() and type == 'twitter':
            Word_count_twitter[i] += 1

        # all
        if i not in Word_count_all.keys():
            Word_count_all[i] = 1
        elif i in Word_count_all.keys():
            Word_count_all[i] += 1

    return Word_count_pubmed, Word_count_twitter,Word_count_all
Example #8
def stem(tokens):
    p = PorterStemmer()
    stems = []
    for token in tokens:
        stem = p.stem(token, 0, len(token) - 1)
        stems.append(stem)

    return list(filter(None, stems))
def stemWords(inList):
## Function that stems the words.
## Name: stemWords; input: list (of tokens); output: list (of stemmed tokens)
    outlist = []
    p = PorterStemmer()
    for word in inList:
        outlist.append(p.stem(word, 0, len(word)-1))
    return outlist
def dict_qryid_terms(is_stopping):
  global STOPWORDS_FILE 
  stopwords_list = stopwords(STOPWORDS_FILE)  ## create stopwords list
  p = PorterStemmer() ##create an Porter Stemmer instance 
  dictquery = defaultdict(lambda: [])  ## create the target dictionary
  with open(QUERY_TEXT_FILE, 'r') as f: 
    for line in f: 
      data_list = re.findall(r"[\w]+", line)
      query_id = data_list[0]
      for term in data_list[1:]:
        term = term.lower()
        if is_stopping:
          if term not in stopwords_list:
            dictquery[query_id].append(p.stem(term, 0,len(term)-1))
        else: 
            dictquery[query_id].append(p.stem(term, 0,len(term)-1))
  return dictquery
Example #11
def dict_qryid_terms(is_stopping):
  global STOPWORDS_FILE 
  stopwords_list = stopwords(STOPWORDS_FILE)  ## create stopwords list
  p = PorterStemmer() ##create an Porter Stemmer instance 
  dictquery = defaultdict(lambda: [])  ## create the target dictionary
  with open(QUERY_TEXT_FILE, 'r') as f: 
    for line in f: 
      data_list = re.findall(r"[\w]+", line)
      query_id = data_list[0]
      for term in data_list[1:]:
        term = term.lower()
        if is_stopping:
          if term not in stopwords_list:
            dictquery[query_id].append(p.stem(term, 0,len(term)-1))
        else: 
            dictquery[query_id].append(p.stem(term, 0,len(term)-1))
  return dictquery
Example #12
def stem_string(line):
    if line == "":
        return ""
    p = PorterStemmer()
    word = ""
    output = ""
    for c in line:
        if c.isalpha():
            word += c.lower()
        else:
            if word:
                output += p.stem(word, 0,len(word)-1)
                word = ''
            output += c.lower()
    if word:
        output += p.stem(word, 0,len(word)-1)
    return output
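
A hypothetical usage sketch for stem_string (not part of the original example): alphabetic runs are stemmed, while spaces and punctuation pass through lowercased:

# Hypothetical usage of stem_string defined above.
print(stem_string("Relational Databases, Indexing and Retrieval!"))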
def stem_words(list_of_tokens):
    stemmer = PorterStemmer()  # Declares the stemmer object
    for token_index, token in enumerate(list_of_tokens):
        list_of_tokens[token_index] = stemmer.stem(
            token, 0,
            len(token) - 1)  # Stems the word using the function

    return list_of_tokens  # Returns the "post-stem" list of tokens
Example #14
 def stemInputAndCheckMatch(self, uType, word):
     ps = PorterStemmer()
     stemmedWord = ps.stem(word)
     matchingWords = self.checkMatches(uType, stemmedWord)
     data = self.getMostFrequentWords(matchingWords)
     if (data[1] != 1):
         return data[0]
     else:
         return []
 def stemming(self, tokens):
     stemmed_tokens = []
     stem_func = PorterStemmer()
     for c in tokens:
         if c.isalpha():
             stemmed_tokens.append(stem_func.stem(c, 0,len(c)-1))
         else:
             stemmed_tokens.append(c)
     return stemmed_tokens
Example #16
def getTopTerms(currentQuery, weightsMap, topX):

    p = PorterStemmer()
    current_terms = []
    for term in currentQuery.split():
        term = p.stem(term.lower(), 0,len(term)-1)
        current_terms.append(term)    

    i = 0
    new_terms = []
    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
        if term in constants.QUERY_SKIP_TERMS or p.stem(term.lower(), 0,len(term)-1) in current_terms:
            continue
        new_terms.append(term)
        current_terms.append(p.stem(term.lower(), 0,len(term)-1))
        i = i + 1
        if (topX != 'ALL' and i >= topX):
            break
    return new_terms
def load_dictionary(filename, stem=True):
    """Loads line separated dictionary into a list"""
    out = []
    for word in open("dictionaries/%s" % filename, "r"):
        word = word.strip().lower()  # strip the trailing newline before stemming
        if stem is True:
            p = PorterStemmer()
            word = p.stem(word, 0,len(word)-1)               
        out.append(word)
    return out
Example #18
    def preprocess(self, query):
        p = PorterStemmer()
        result = []

        # remove any non-alphanumeric characters [a-zA-Z0-9_]
        query = re.sub("[^\w]", " ", query)
        query = query.lower().split(' ')
        for word in query:
            if word not in self.stopwords:
                result.append(p.stem(word, 0, len(word) - 1))
        return result
def format_description(text, stop_words):
    words = text.split()

    stemmer = PorterStemmer()
    non_stop_words = []
    for word in words:
        if word not in stop_words:      # Not a stop word, so lower, remove punctuation, and stem
            lowered_token = remove_punctuation(word).lower()
            #non_stop_words.append(lowered_token)
            non_stop_words.append(stemmer.stem(lowered_token))

    return ' '.join(non_stop_words)
Example #20
def getTopTerms(currentQuery, weightsMap, topX):

    p = PorterStemmer()
    current_terms = []
    for term in currentQuery.split():
        term = p.stem(term.lower(), 0, len(term) - 1)
        current_terms.append(term)

    i = 0
    new_terms = []
    for term in sorted(weightsMap, key=weightsMap.get, reverse=True):
        if term in constants.QUERY_SKIP_TERMS or p.stem(
                term.lower(), 0,
                len(term) - 1) in current_terms:
            continue
        new_terms.append(term)
        current_terms.append(p.stem(term.lower(), 0, len(term) - 1))
        i = i + 1
        if (topX != 'ALL' and i >= topX):
            break
    return new_terms
Example #21
def finalize(tInput, swInput):
    p = PorterStemmer()
    output = open("output.txt", 'w')
    for i in range(len(tInput)):
        token = tInput[i]
        if token == "a" or token == "an" or token == "the":
            output.write("%s\t- article\n" % token)
        elif any(token in x for x in swInput):
            output.write("%s\t- stop word\n" % token)
        else:
            stemword = p.stem(token, 0, len(token) - 1)
            output.write("%s\t- %s\n" % (token, stemword))
    output.close()
Example #22
    def __init__(self, parent, docno, doc, terms):
        QtGui.QDialog.__init__(self, parent)

        self.setupUi(self)

        # Set fields
        self.labelDocumentNo.setText(docno)

        textDocument = self.textEdit.document()
        textCursor = QtGui.QTextCursor(textDocument)

        normalFormat = QtGui.QTextCharFormat()
        termFormat = QtGui.QTextCharFormat()
        termFormat.setForeground(QtGui.QBrush(QtGui.QColor("red")))
        termFormat.setFontWeight(QtGui.QFont.Bold)

        textCursor.beginEditBlock()

        stemmer = PorterStemmer()
        terms = terms.split(",")
        stemmed_terms = [
            stemmer.stem(term, 0,
                         len(term) - 1) for term in terms
        ]

        for line in unicode(doc).split("\n"):
            for word in line.split(" "):
                nword = word.lower().strip(punctuation)
                sword = stemmer.stem(nword, 0, len(nword) - 1)
                if nword in terms or sword in stemmed_terms:
                    textCursor.insertText(word, termFormat)
                else:
                    textCursor.insertText(word, normalFormat)
                textCursor.insertText(" ", normalFormat)

            textCursor.insertText("\n", normalFormat)

        self.textEdit.moveCursor(QtGui.QTextCursor.Start)
Example #23
 def stem_text(text):
     p = PorterStemmer()
     stemmed_text = ''
     word = ''
     for i, c in enumerate(text):
         if c.isalpha():
             word += c.lower()
         if not c.isalpha() or i == (len(text) - 1):
             if word:
                 stemmed_text += p.stem(word, 0,len(word)-1)
                 word = ''
             if c.lower() == ' ':
                 stemmed_text += c.lower()
     return stemmed_text
Example #24
def remove_porterstemmer(input_file, noise_words_set):
    questions = list()
    word_weight = []
    p = PorterStemmer()
    for line in input_file:
        line = line.lower()
        words = filter(None, re.split("\W*\d*", line))
        question = []
        for word in words:
            new_word = p.stem(word, 0, len(word) - 1)
            if new_word not in noise_words_set and len(new_word) > 2:
                question.append(new_word)
        questions.append(question)
        word_weight.append(Counter(question))
    return word_weight, questions
def remove_porterstemmer(input_file,noise_words_set):
	questions = list()
	word_weight = []
	p = PorterStemmer()
	for line in input_file:
		line = line.lower()
		words = filter(None, re.split("\W*\d*", line))
		question = []
		for word in words:
			new_word = p.stem(word,0,len(word)-1)
			if new_word not in noise_words_set and len(new_word)>2:
				question.append(new_word)
		questions.append(question)
		word_weight.append(Counter(question))
	return word_weight, questions
Example #26
def en_preprocess(file_path: str, stop_words: list, step: int = 4) -> str:
    '''
    Step1: Extract pure-text content from the original html file
    Step2: To lower case, remove special characters
    Step3: Remove stop words
    Step4: Porter stemming (Final result)
    '''
    with open(file_path, "r", encoding="UTF-8") as f:
        html_content = f.read()
        parsed_content = BeautifulSoup(html_content, 'html.parser')
        text_content = ""
        # Extract pure-text content from the original html file
        for child in parsed_content.find(id="mw-content-text").div.children:
            if child.name in ("p", "h2", "h3", "h4", "h5"):
                text_content += child.get_text()
        if step == 1:
            return text_content
        # To lower case
        text_content = text_content.lower()
        # Remove special characters
        text_content = text_content.replace("'", "")
        text_content = text_content.replace("-", "")
        for i in range(len(text_content)):
            curr_char = text_content[i]
            if not ((curr_char >= 'a' and curr_char <= 'z')):
                text_content = text_content.replace(curr_char, " ")
        # Remove duplicated spaces
        text_content = re.sub("[ ]+", " ", text_content)
        if step == 2:
            return text_content
        # Tokenize
        token_list = text_content.split(" ")
        # Remove stop words
        new_list = []
        for token in token_list:
            if token not in stop_words and token != "":
                new_list.append(token)
        token_list = new_list
        if step == 3:
            return " ".join(token_list)
        # Porter stemming
        p = PorterStemmer()
        new_list = []
        for i in range(len(token_list)):
            new_list.append(p.stem(token_list[i], 0, len(token_list[i]) - 1))
        token_list = new_list
        final_result = " ".join(token_list)
        return final_result
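
A hypothetical usage sketch for en_preprocess (the file path and stop word list below are placeholders, not from the original code). The step argument returns the intermediate result of the corresponding pipeline stage, with step=4 giving the final stemmed text:

# Hypothetical usage of en_preprocess defined above; the path and stop word
# list are assumptions for illustration only.
stop_words = ["the", "a", "an", "of", "and", "in", "to"]
stemmed_text = en_preprocess("pages/Information_retrieval.html", stop_words, step=4)
print(stemmed_text[:200])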
Example #27
def tokenize(documents):
    # Read the stopwords
    stop_word_set = set(open('./stopwords.txt', 'r').read().split())
    # Initialize the Porter stemmer
    p = PorterStemmer()
    # Create a dictionary where each element is also a dictionary. The outer dictionary will map stemmed words to
    # document ids and the inner dictionaries will map the document ids to their indices in the document.
    word_to_doc = defaultdict(lambda: defaultdict(list))  # Positional inverted index
    for document_index, document in enumerate(documents, start=1):
        for word_index, word in enumerate(document.split()):
            if word not in stop_word_set:
                # Store each word as stemmed and put them to the inverted index
                stemmed_word = p.stem(word, 0, len(word) - 1)
                # stemmed_word = word
                word_to_doc[stemmed_word][document_index].append(word_index)
    return word_to_doc
Example #28
def search_dic(text, SearDic, original_word, index):
    text = remove_tag(text)
    word = text.split()
    p = PorterStemmer()
    for i in word:

        # poter_i = i
        poter_i = p.stem(i, 0, len(i) - 1)  # porter
        if poter_i not in SearDic.keys():
            SearDic[poter_i] = [index]
            original_word[poter_i] = [i]
        else:
            if index not in SearDic[poter_i]:
                SearDic[poter_i].append(index)
                if i not in original_word[poter_i]:
                    original_word[poter_i].append(i)
    return SearDic, original_word
Example #29
def getQuestionKeywords(question):
    """Return the keywords from a question.

    The logic is: remove the stop words and punctuations from question, stem the keywords and remove duplicates
    Currently there are still issues with
    1. stop words list is not complete: eg "recommend" etc is not a stop word.
    2. stemmer issue: The current stemmer utility has an issue eg "restaurant" is stemmed to "restau"

    >>> getQuestionKeywords('what is the best preschool in Potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('Can someone help with a preschool around potomac?')
    ['potomac', 'preschool']

    >>> getQuestionKeywords('What is the best cafeteria around potomac?')
    ['potomac', 'restaurant']

    """

    # split the question into a list
    keywordList = question.split()

    # strip the punctuations etc
    keywordList = [keyword.strip(PUNCTUATION) for keyword in keywordList]

    # convert into lower case
    keywordList = [keyword.lower() for keyword in keywordList]

    #remove stop words from keywords
    keywordList = [keyword for keyword in keywordList if keyword not in stopWords]

    #stem the keywords
    stemmer = PorterStemmer()
    keywordList = [stemmer.stem(keyword,0,len(keyword)-1) for keyword in keywordList]

    #take care of synonyms
    keywordList = [synonyms[keyword] if keyword in synonyms else keyword for keyword in keywordList ]

    #remove duplicates
    keywordList = list(set(keywordList))

    #sort the keywords
    keywordList.sort()
    
    return keywordList
Example #30
class Tokenizer:
    """ Helper class for tokenizing document space and removing stop words """

    corpus = None
    terms = []
    stop_words = []
    stemmer = None

    def __init__(self):

        # read stop words from file
        self.stop_words = open('stop_words.txt', 'r').read().split()
        self.stemmer = PorterStemmer()

    def tokenize(self, docs_string):
        """ Tokenizer's most important method.
        It separates the whole corpus string in tokens and
        removes stop words.
        """
        self.corpus = docs_string

        self.clean()

        self.terms = self.corpus.split(" ")

        self.remove_stop_words()

        self.remove_duplicates()

        return self.terms

    def clean(self):
        """ get rid of punctuation signs, convert to lower case, standardize spacing """
        self.corpus = self.corpus.replace(".", " ")
        self.corpus = self.corpus.replace(",", " ")
        self.corpus = self.corpus.lower()
        self.corpus = self.corpus.replace("\s+", " ")

    def remove_stop_words(self):
        self.terms = [self.stemmer.stem(term,0,len(term)-1) for term in self.terms if term not in self.stop_words]

    def remove_duplicates(self):
        """ remove duplicated terms in the list """
        self.terms = set(self.terms)
Example #31
 def parse(self):
     #remove stop words
     self.dataList = [w for w in self.dataList if not w in self.stopWords]
     #get the stem of the words
     st = PorterStemmer()
     self.dataList = [st.stem(w, 0, len(w)-1) for w in self.dataList]        
     # add to list based on frequency of occurrence
     wordFreq = {}
     for word in self.dataList:
         if word in wordFreq:
             wordFreq[word] = wordFreq[word] + 1
         else:
             wordFreq[word] = 1
     wordList = sorted(wordFreq.iteritems(), key = operator.itemgetter(1))
     newList = []
     for w in wordList:
         newList.insert(0,w[0])
     self.dataList = newList
Example #32
class Processor:
    def __init__(self, path, num_records):
        self.porter = PorterStemmer()
        self.stop = set()
        with open('stop.words.dat', 'r') as sw:
            for line in sw:
                self.stop.add(line[:-1])

        if path != '' and num_records != 0:
            self.process(path, num_records)

    def process(self, path, num_records):
        with open(path, 'r', encoding='utf-8') as src:
            with open('sample.txt', 'w') as dst:
                num_total = 0
                for line in src:
                    AnonID, Query, QueryTime = line.split('\t')[:3]

                    if AnonID == 'AnonID':
                        continue

                    if num_total < num_records:
                        tidy = self.trim(Query)
                        if tidy != '':
                            Query = self.remove_stop_words(tidy)
                            Query = self.porter_stemming(Query)
                            if Query != '':
                                dst.write('{}\t{}\t{}\n'.format(
                                    AnonID, Query, QueryTime))
                                num_total += 1

    def trim(self, string):
        return re.sub(r'\W', ' ', string)

    def remove_stop_words(self, string):
        words = string.split()
        return ' '.join([w for w in words if w not in self.stop])

    def porter_stemming(self, string):
        result = [
            self.porter.stem(word, 0,
                             len(word) - 1) for word in string.split()
        ]
        return ' '.join(result)
Example #33
	def classify(self, query):
		if self.isSuicide(query):
			return [('suicidal ideation', 1), ('depression', .5), ('emotional disturbance', .5)]

		query = "".join(c for c in query if c not in ('!','.',':',',',';','?')).lower()
		query_words = query.split() 
		p = PorterStemmer()
		query_words = [p.stem(query_words[i]) for i in range(len(query_words))]
		q = np.zeros(len(self.word_to_index))
		for word in query_words:
			if word in self.word_to_index:
				q[self.word_to_index[word]] += self.idf[self.word_to_index[word]]

		membership_scores = []
		for i in range(len(self.tfidf_matrix)):
			#compute cosine similarity
			docvec = self.tfidf_matrix[i]
			cossim = (np.inner(docvec, q)/(np.linalg.norm(docvec)*np.linalg.norm(q))).item(0,0)
			membership_scores.append(cossim)
		return sorted(zip(self.categories, membership_scores), key=lambda x: x[1], reverse=True)
Example #34
class Processor:
    def __init__(self, path, num_records):
        self.porter = PorterStemmer()
        self.stop = set()
        with open("stop.words.dat", "r") as sw:
            for line in sw:
                self.stop.add(line[:-1])

        if path != "" and num_records != 0:
            self.process(path, num_records)

    def process(self, path, num_records):
        with open(path, "r", encoding="utf-8") as src:
            with open("sample.txt", "w") as dst:
                num_total = 0
                for line in src:
                    AnonID, Query, QueryTime = line.split("\t")[:3]

                    if AnonID == "AnonID":
                        continue

                    if num_total < num_records:
                        tidy = self.trim(Query)
                        if tidy != "":
                            Query = self.remove_stop_words(tidy)
                            Query = self.porter_stemming(Query)
                            if Query != "":
                                dst.write("{}\t{}\t{}\n".format(AnonID, Query, QueryTime))
                                num_total += 1

    def trim(self, string):
        return re.sub(r"\W", " ", string)

    def remove_stop_words(self, string):
        words = string.split()
        return " ".join([w for w in words if w not in self.stop])

    def porter_stemming(self, string):
        result = [self.porter.stem(word, 0, len(word) - 1) for word in string.split()]
        return " ".join(result)
Example #35
def dicts_docid_words_docid_doclen():
  global STOPWORDS_FILE 
  p = PorterStemmer() 
  stopwords_list = stopwords(STOPWORDS_FILE)
  docid_words_dict = defaultdict(lambda: [])
  docid_doclen_dict = {}
  path = CACM_PATH
  """extract all the file names in the path and put them into a list"""
  dirs_list = os.listdir(path)
  for docname in dirs_list:
    docno = ''.join([s for s in docname if s.isdigit()])
    f = urllib.urlopen(path+docname).read()
    data = re.compile(r'.*?<pre>(.*?)([0-9]+\t[0-9]+\t[0-9]+)', re.DOTALL).match(f).group(1)
    data = re.findall(r"[\w]+", data)
    for word in data:
      word = word.lower()  
  #    if word not in stopwords_list:  
      word_stemmed = p.stem(word, 0,len(word)-1)
      docid_words_dict[docno].append(word_stemmed)
    """doclen is the length of doc after stopping and stemming"""
    docid_doclen_dict[docno]=len(data)  
  return docid_words_dict,docid_doclen_dict
def dicts_docid_words_docid_doclen():
  global STOPWORDS_FILE 
  p = PorterStemmer() 
  stopwords_list = stopwords(STOPWORDS_FILE)
  docid_words_dict = defaultdict(lambda: [])
  docid_doclen_dict = {}
  path = CACM_PATH
  """extract all the file names in the path and put them into a list"""
  dirs_list = os.listdir(path)
  for docname in dirs_list:
    docno = ''.join([s for s in docname if s.isdigit()])
    f = urllib.urlopen(path+docname).read()
    data = re.compile(r'.*?<pre>(.*?)([0-9]+\t[0-9]+\t[0-9]+)', re.DOTALL).match(f).group(1)
    data = re.findall(r"[\w]+", data)
    for word in data:
      word = word.lower()  
      if word not in stopwords_list:  
        word_stemmed = p.stem(word, 0,len(word)-1)
        docid_words_dict[docno].append(word_stemmed)
    """doclen is the length of doc after stopping and stemming"""
    docid_doclen_dict[docno]=len(data)  
  return docid_words_dict,docid_doclen_dict
Example #37
def useSynonyms():
    # Using Thesures(Synonyms)
    global synonyms
    global synonyms_list
    synonyms = {}
    synonyms_list = []
    useThesures = True
    if useThesures:
        p = PorterStemmer()
        with open("../txt_files/synonyms_short.txt", 'r') as f:
            lines = f.read().split('$')
            index = 0
            for line in lines:
                words_list = []
                words = line.split(',')
                for word in words:
                    word = word.strip()
                    if re.search('[a-zA-Z]', word):
                        word = word.lower()
                        word = p.stem(word, 0, len(word) - 1)
                        words_list.append(word)
                        synonyms[word] = index
                synonyms_list.append(words_list)
                index += 1
Example #38
class Parser:
    stemmer = None
    stopwords = []

    def __init__(self, ):
        self.stemmer = PorterStemmer()
        self.stopwords = open('english.stop', 'r').read().split()

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = string.replace(".", "")
        string = string.replace("\s+", " ")
        string = string.lower()
        return string

    def remove_stop_words(self, list):
        """ Remove common words which have no search value """
        return [word for word in list if word not in self.stopwords]

    def tokenise(self, string):
        """ break string up into tokens and stem words """
        string = self.clean(string)
        words = string.split(" ")
        return [self.stemmer.stem(word, 0, len(word) - 1) for word in words]
Example #39
from pprint import pprint
import math

corpus = ["At work", "New job", "Enjoying", "Beer", "Days off", "wedding", "Office", "Drinks", "Wine", "Drinks", "Blessed", "A drink", "Hubby", "Much needed", "New place", "Thankful", "apartment", "Excited about", "Vacation", "Celebrate", "Let me know", "Had a blast", "laundry", "care of", "company", "Grocery", "Wishes", "Drinking for eveveryone", "After work", "To work tommorow", "Bills", "taxes", "Husband", "shift", "The bar", "Potty", "ready to", "Celebrating", "To enjoy", "My babies", "Errands", "Relaxing", "apt", "Fingers crossed", "Poor baby", "Day to all", "women", "Work", "Yard", "Doesn't", "Uni", "Days", "Volunteer", "Schedule", "repeat", "House", "Apartment", "Moving", "place", "Rent", "Move", "Month", "Bedroom", "Lease", "Signed", "Roommate", "Interested", "Complex", "Area", "Interest", "apt", "Drinking", "Beer", "Drink", "Cold", "Root", "Beers", "Pong", "Ale", "Ginger", "Cans", "Drinkin", "ginger", "Pint", "Cans", "Bbq", "Pub", "bottles", "Home", "Work", "Ready", "Hubby", "Bed", "Dinner", "relax", "Shower", "Heading", "Relaxing", "Chill", "Nap", "Early", "Supper", "Snuggle", "Money", "Pay", "Bills", "Paid", "Paying", "Bill", "Job", "Month", "Rent", "Check", "Taxes", "Bucks", "Debt", "paycheck", "job", "Position", "Company", "Interview", "Experience", "Manager", "Assistant", "Interested", "Career", "Business", "Resume", "Sales", "Hiring", "Hire"]
stoplist = set('for a of the and to in'.split())

stemmer = PorterStemmer()

texts = [[word for word in string.lower().split() if word not in stoplist]
			for string in corpus]

words = reduce(list.__add__, texts)

stems = []
for word in words:
	stem = stemmer.stem(word)
	stems.append(stem)

stemCounts = {}

numStems = len(stems)
for word in stems:
	if word not in stemCounts:
		stemCounts[word] = 1.0
	else:
		stemCounts[word] = stemCounts[word] + 1.0


for word in stemCounts:
	stemCounts[word] = stemCounts[word]/numStems
	stemCounts[word] = float("{0:.3f}".format(stemCounts[word]))
Example #40
class IRSystem:

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()


    def get_uniq_words(self):
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq


    def __read_raw_data(self, dirname):
        print "Stemming Documents..."

        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print "    Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's contents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs


    def __read_stemmed_data(self, dirname):
        print "Already stemmed!"
        titles = []
        docs = []

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)

        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's contents
                contents.extend(line)
            f.close()
            docs.append(contents)

        return titles, docs


    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).

        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)

        # Sort document alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles),
            key = lambda xx : xx[1])]

        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])

        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]


    def compute_tfidf(self):
        # -------------------------------------------------------------------
        # Compute and store TF-IDF values for words and documents.
        #       Recall that you can make use of:
        #         * self.vocab: a list of all distinct (stemmed) words
        #         * self.docs: a list of lists, where the i-th document is
        #                   self.docs[i] => ['word1', 'word2', ..., 'wordN']
        # -------------------------------------------------------------------
        print "Calculating tf-idf..."
        self.tfidf = {}
        idf = {}
        doc_cont = {}
        
        for i, doc in enumerate(self.docs):
            doc_cont[i] = collections.Counter(doc)
        
        for word in self.vocab:
            word_set = 0.0 + len(IRSystem.get_posting(self, word))
            idf[word] = math.log10(len(self.docs) / word_set)
            if word not in self.tfidf:
                self.tfidf[word] = {}
            for d in range(len(self.docs)):
                tf = doc_cont[d][word]
                if tf == 0.0:
                    self.tfidf[word][d] = 0.0
                else:
                    self.tfidf[word][d] = (1.0 + math.log10(tf)) * idf[word]
    

    def get_tfidf(self, word, document):
        """ Return the tf-idf weigthing for the given word (string) and
            document index.
        """
        tfidf = self.tfidf[word][document]
        return tfidf


    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)


    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."

        inv_index = {}
    
        # Create a list for each word
        for word in self.vocab:
                inv_index[word] = []
        
        # Copy the index of document where the word is
        for i, doc in enumerate(self.docs):
            for word in set(doc):
                    inv_index[word].append(i)
    
        self.inv_index = inv_index


    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        posting = []
        
        # Return the sorted, de-duplicated posting list for the given word
        posting = sorted(set(self.inv_index[word]))

        return posting


    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)


    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (ie an AND
        query).
        Return an empty list if the query does not return any documents.
        """

        docs = []
        words_list = []
        
        # Store in words_list the inv_index of each word of the query
        for word in query:
            words_list.append(set(self.inv_index[word]))
                    
        # Intersect the words_list in a list with the common documents
        docs = reduce (lambda x,y: x & y, words_list)

        return sorted(docs)   # sorted doesn't actually matter


    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for xx in range(len(self.docs))]
 
        q_count = {}

        # Calculate a counter of term-frecuency in query
        q_count = collections.Counter(query)

    
        for d, doc in enumerate(self.docs):
            intersec = set(query).intersection(set(doc))
            numerator = 0.0
            denominator = 0.0
            for word in intersec:
                qt = (1.0 + math.log10(q_count[word]))
                dt = self.get_tfidf(word,d)
                numerator = numerator + qt*dt
            
            for word in set(doc):
                dd = self.get_tfidf(word,d)
                denominator = denominator + dd*dd

            scores[d] = numerator/math.sqrt(denominator)

        ranking = [idx for idx, sim in sorted(enumerate(scores),
            key = lambda xx : xx[1], reverse = True)]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results


    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query


    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching documents
        found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)


    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
Example #41
file = open('english.stop')
StopWords = set()
for word in file:
	word = word.strip()
	if word != '':
		StopWords.add(word)

for line in sys.stdin:
	line = line.strip().split('\t',2)
	if len(line) != 2:
		continue
	else:
		title = line[0]
		text = line[1]
		#print text
		p = PorterStemmer() 
		word = ''
		if text == '':
		    continue
		for c in text:
			if c.isalpha():
				word += c.lower()
			else:
				if word:
					if word not in StopWords:
						output = p.stem(word, 0, len(word) - 1)
						print "%s@-@%s\t1" % (output, title)
				word = ''

def main(filename, crashed=False): #"crashed" is an option to continue from the current state if the requests time out
	#print filename
	#print crashed

	h = html2text.HTML2Text()
	h.ignore_links = True
	stemmer = PorterStemmer()
	tf_folder_path = os.path.join(os.getcwd(), 'tf')
	if not os.path.exists(tf_folder_path):
		os.mkdir(tf_folder_path)
	corpus = set()
	pause_time = 0.5

	if check_validity:
		valid_words = set(str(stemmer.stem(line.rstrip().lower())) for line in open(valid_words_file, 'r'))
		stop_words = set(str(stemmer.stem(line.rstrip().lower())) for line in open(stop_words_file, 'r'))
	keywords = set(str(stemmer.stem(word.lower())) for word in extra_search_keywords.split())
		stop_words = stop_words.union(keywords)

	if logging:
		log = open('tf-log', 'w')

	# Step 1: Find all distinct specialty classes.
	connection = sqlite3.connect(db_name)
	c = connection.cursor()
	db.select(c, ['specialty'], 'th_specialties', distinct=True)
	issues = set(str(re.sub(r'[^a-zA-Z]+', ' ', i[0])).lower() for i in c.fetchall())
	connection.close()
	if logging:
		log.write("Issues: \n")
		log.write(', '.join(issues))
		log.write('\n\n')

	print "Step 1 complete."

	# Step 2: For each category, find the top num_articles google results and generate tf counts of the stemmed plaintext.

	if crashed:
		completed = set(f for f in os.listdir(tf_folder_path) if os.path.isfile(os.path.join(tf_folder_path, f)))
		issues = issues - completed
		#print issues

	for issue in issues:
		results = search(issue + ' ' + extra_search_keywords, stop = num_articles, pause = pause_time)
		urls = [str(url) for url in results][: num_articles]
		
		if logging:
			print issue
			log.write('Issue: ' + issue + '\n')
			log.write('\n'.join(urls))
			log.write('\n\n')

		cumulative = []

		for url in urls:
			if not url.endswith('.pdf'):
				try:
					html = urllib2.urlopen(url) #gets the raw html of the url
					plaintext = h.handle(unicode(html.read(), 'ISO-8859-1')) #converts the html into plaintext
					processed = re.sub(r'[^a-zA-Z]+', ' ', plaintext)
					if check_validity:
						for word in processed.split():
							processed = str(stemmer.stem(word.lower()))
							if processed not in stop_words and processed in valid_words:
								cumulative.append(processed)
					else:
						stemmed = [str(stemmer.stem(word.lower())) for word in processed.split()]
						cumulative += stemmed
				except: #mostly to ignore urllib2 errors...
					pass
		counts = Counter(cumulative)
		tf = open(os.path.join(tf_folder_path, issue), 'w')

		for word in sorted(counts.keys()): #sort words in alphabetical order
			corpus.add(word)
			tf.write(str((word, counts[word]))) #write tuples of words with the word count
			tf.write('\n')

		tf.close()

	print "Step 2 complete."

	# Step 3: Combine files

	files = sorted(issues)
	num_files = len(files)
	count_vectors = {}
	for word in corpus:
		count_vectors[word] = [0]*num_files

	# Flesh out count_vectors
	for i in range(len(files)):
		curr = open(os.path.join(tf_folder_path, files[i]), 'r')
		for line in curr.readlines():
			pair = ast.literal_eval(line)
			count_vectors[pair[0]][i] = pair[1]
		curr.close()

	# Write to tf_matrix
	tf_matrix = open(filename, 'w')
	tf_matrix.write(','.join(files))
	tf_matrix.write('\n')

	for word in sorted(count_vectors.keys()):
		line = word + ',' + ','.join([str(num) for num in count_vectors[word]])
		tf_matrix.write(line)
		tf_matrix.write('\n')
	tf_matrix.close()

	shutil.rmtree(tf_folder_path) #removes intermediates!

	print "Step 3 complete."
		
	if logging:
		log.close()
Example #43
class Indexer(object):

    def __init__(self):
        self.dname2id = pickle.load(open('doc2id.pkl', 'rb'))
        try:
            f = open('stopword_list.txt', 'r')
        except IOError:
            raise IOError('Failed to open stopword_list.txt.')

        self.stoplist = f.read().split()
        self.porter = PorterStemmer()
        ## term to its posting list.
        self.index = {}
        self.pos_index = defaultdict(list)
        self.doc_num = len(self.dname2id)

    def terms_for_keywords_query(self, terms):
        ## Filter out stop words.
        return [t for t in terms if t not in self.stoplist]

    def get_terms(self, contents):
        terms = contents.split()
        terms = map(del_punc, terms)
        terms = map(lambda s : s.lower(), terms)

        ## Terms for keywords based query(aka: free text query).
        terms_for_kq = [self.porter.stem(term, 0, len(term)-1) for term in self.terms_for_keywords_query(terms)]

        ## Terms for phrase query.
        terms_for_pq = [self.porter.stem(term, 0, len(term)-1) for term in terms]

        return terms_for_kq, terms_for_pq

    def get_doc_id(self, dname):
        return self.dname2id[dname]

    def build_posting_list_for_pq(self, terms, doc_id):
        """
        Build posting list(term : [doc, [positions]]) for phrase query.
        """
        term2doc_pos = {}
        for pos, term in enumerate(terms):
            try:
                term2doc_pos[term][1].append(pos)
            except:
                term2doc_pos[term] = [doc_id, [pos]]

        for term, posting in term2doc_pos.iteritems():
            self.pos_index[term].append(posting)

    def build_posting_list_for_kq(self, terms, doc_id):
        """
        Build posting list(term : [idf, [(doc1, tf), (doc2, tf), ...]]) for keywords based query.
        """
        tf_counter = Counter(terms)
        max_elem = tf_counter.most_common(1)
        most_common_term = max_elem[0][0]
        max_tf = max_elem[0][1]
        # print 'Most common term is:', most_common_term, '\tMax tf is:', max_tf

        for term, tf in tf_counter.iteritems():
            if not self.index.has_key(term):
                df = 1
                self.index[term] = [df, [(doc_id, float(tf)/max_tf)]]
            else:
                df = self.index[term][0]
                df += 1
                self.index[term][0] = df
                self.index[term][1].append((doc_id, float(tf)/max_tf))

    def write_index_to_file(self):
        pickle.dump(self.index, open('index.pkl', 'wb'))
        pickle.dump(self.pos_index, open('pos_index.pkl', 'wb'))

    def compute_idf(self):
        for term, postings in self.index.iteritems():
            postings[0] = log(float(self.doc_num)/postings[0], 2)

    def parse_collection(self):

        stdout_old = sys.stdout
        sys.stdout = open('indexer_log', 'w')
        print 'Total %d documents need to be processed.' % self.doc_num

        for index, (doc_name, doc_id) in enumerate(sorted(self.dname2id.iteritems(), key=itemgetter(1))):
            try:
                print 'Building index for:', os.path.basename(doc_name),
                print '\tDocument ID:', doc_id
                f = open(doc_name, 'r')
            except IOError:
                raise IOError('Unable to open document [%s]' % doc_name)

            ## Get terms for keywords based query and phrase based query.
            terms_for_kq, terms_for_pq = self.get_terms(f.read())
            
            self.build_posting_list_for_kq(terms_for_kq, doc_id)
            self.build_posting_list_for_pq(terms_for_pq, doc_id)

        self.compute_idf()
        self.write_index_to_file()

        sys.stdout = stdout_old
Example #44
class IRSystem:

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()


    def get_uniq_words(self):
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq


    def __read_raw_data(self, dirname):
        print "Stemming Documents..."

        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print "    Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's contents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs


    def __read_stemmed_data(self, dirname):
        print "Already stemmed!"
        titles = []
        docs = []

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)

        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's contents
                contents.extend(line)
            f.close()
            docs.append(contents)

        return titles, docs


    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).

        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)

        # Sort document alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles),
            key = lambda xx : xx[1])]

        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])

        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]


    def compute_tfidf(self):
        # -------------------------------------------------------------------
        # TODO: Compute and store TF-IDF values for words and documents.
        #       Recall that you can make use of:
        #         * self.vocab: a list of all distinct (stemmed) words
        #         * self.docs: a list of lists, where the i-th document is
        #                   self.docs[i] => ['word1', 'word2', ..., 'wordN']
        #       NOTE that you probably do *not* want to store a value for every
        #       word-document pair, but rather just for those pairs where a
        #       word actually occurs in the document.
        print "Calculating tf-idf..."
        self.tfidf = {}
        N = len(self.docs)
        for word in self.vocab: 
            if word not in self.tfidf: 
                self.tfidf[word] = {} 
            idf = math.log10(N*1./len(self.inv_index[word]))
            for index,d in enumerate(self.inv_index[word]):
                tf = math.log10(1.*len(self.inv_index[word][d]))
                self.tfidf[word][d] = (1+tf)*idf

        # Calculate per-document l2 norms for use in cosine similarity
        # self.tfidf_l2norm[d] = sqrt(sum[tdidf**2])) for tdidf of all words in 
        # document number d
        tfidf_l2norm2 = {}
        for word, d_dict in self.tfidf.items():
            for d,val in d_dict.items():
                tfidf_l2norm2[d] = tfidf_l2norm2.get(d, 0.0) + val ** 2
        self.tfidf_l2norm = dict((k,math.sqrt(v)) for k,v in tfidf_l2norm2.items())               



        # ------------------------------------------------------------------
        # The term frequency tft,d of term t in document d is defined as the number of times that t occurs in d.

    def get_tfidf(self, word, document):
        # ------------------------------------------------------------------
        # TODO: Return the tf-idf weighting for the given word (string) and
        #       document index.
        # Use .get() so a (word, document) pair with no stored tf-idf value
        # returns 0 instead of raising KeyError.
        return self.tfidf.get(word, {}).get(document, 0)


    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)


    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        # ------------------------------------------------------------------
        # TODO: Create an inverted index.
        #       Granted this may not be a linked list as in a proper
        #       implementation.
        #       Some helpful instance variables:
        #         * self.docs = List of documents
        #         * self.titles = List of titles

        inv_index = {}
        for i,title in enumerate(self.titles):
            for j,word in enumerate(self.docs[i]):
                if not word in inv_index:
                    inv_index[word] = {}
                if not i in inv_index[word]:
                    inv_index[word][i] = []
                inv_index[word][i].append(j)

        self.inv_index = inv_index

        # ------------------------------------------------------------------


    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        # ------------------------------------------------------------------
        # TODO: return the list of postings for a word.
        posting = self.inv_index[word].keys()
        return posting
        # ------------------------------------------------------------------


    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)


    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (i.e. an AND
        query).
        Return an empty list if the query does not return any documents.
        """
        # ------------------------------------------------------------------
        # TODO: Implement Boolean retrieval. You will want to use your
        #       inverted index that you created in index().
        # Right now this just returns all the possible documents!
        docs = []
        for d in range(len(self.docs)):
            docs.append(d)

        docsets = set(docs)
        for q in query:
            docsets &= set(self.inv_index[q].keys())
        docs = list(docsets)

        # ------------------------------------------------------------------

        return docs   # sorted doesn't actually matter


    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for xx in range(len(self.docs))]
        # ------------------------------------------------------------------
        # TODO: Implement cosine similarity between a document and a list of
        #       query words.

        # Right now, this code simply gets the score by taking the Jaccard
        # similarity between the query and every document.
        """
        words_in_query = set()
        for word in query:
            words_in_query.add(word)

        for d, doc in enumerate(self.docs):
            words_in_doc = set(doc)
            scores[d] = len(words_in_query.intersection(words_in_doc)) \
                    / float(len(words_in_query.union(words_in_doc)))

        # ------------------------------------------------------------------

        ranking = [idx for idx, sim in sorted(enumerate(scores),
            key = lambda xx : xx[1], reverse = True)]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results
        """
        wordvec = {}
        for word in query:
            wordvec[word] = wordvec.get(word,0) + 1
        wordvec = dict((word, math.log10(wordvec[word])+1.) for word in wordvec)
        def get_score(d):
            """Return score for document d
                This is cos(query_vec * d_vec/norm) where 
                    d_vec[word] = tfidf of word in doc number d 
                    norm = sqrt(d_vec[w]**2) for all words w in doc number d
            """
            
            d_vec = dict((word, self.tfidf[word].get(d,0.0)) for word in wordvec)    
            return sum(wordvec[word] * d_vec[word] for word in d_vec)/self.tfidf_l2norm[d]
        
        # Compute scores and add to a priority queue
        scores = []
        for d in range(len(self.docs)):
            heapq.heappush(scores, (get_score(d), d))
        # Return top 10 scores
        return [(k,v) for v,k in heapq.nlargest(10,scores)]


    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query


    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching documents
        found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)


    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
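
The compute_tfidf and rank_retrieve methods above use the (1 + log10 tf) * log10(N/df) weighting with a per-document L2 norm for cosine scoring. Below is a minimal standalone sketch of the same weighting on a toy corpus; all names and numbers are illustrative and not taken from the original code.

import math
from collections import defaultdict

# Toy corpus: doc id -> list of (already stemmed) tokens.
toy_docs = {0: ["king", "solomon", "mine"],
            1: ["king", "she", "she"],
            2: ["allan", "quatermain"]}

# Inverted index with positions, mirroring the index() method above.
inv_index = defaultdict(dict)
for d, words in toy_docs.items():
    for pos, w in enumerate(words):
        inv_index[w].setdefault(d, []).append(pos)

# tfidf[word][d] = (1 + log10 tf) * log10(N / df), stored only where tf > 0.
N = len(toy_docs)
tfidf = defaultdict(dict)
for word, postings in inv_index.items():
    idf = math.log10(float(N) / len(postings))
    for d, positions in postings.items():
        tfidf[word][d] = (1.0 + math.log10(len(positions))) * idf

# Per-document L2 norms, as in tfidf_l2norm above.
l2norm = {d: math.sqrt(sum(tfidf[w][d] ** 2 for w in set(words)))
          for d, words in toy_docs.items()}

print("tfidf('she', doc 1) = %.3f, norm(doc 1) = %.3f"
      % (tfidf["she"][1], l2norm[1]))
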
Example #45
0
class IRSystem:
    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()

    def get_uniq_words(self):
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq

    def __read_raw_data(self, dirname):
        print "Stemming Documents..."

        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print "    Doc %d of %d: %s" % (i + 1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's contents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs

    def __read_stemmed_data(self, dirname):
        print "Already stemmed!"
        titles = []
        docs = []

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)

        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's contents
                contents.extend(line)
            f.close()
            docs.append(contents)

        return titles, docs

    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).

        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)

        # Sort documents alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [
            idx
            for idx, title in sorted(enumerate(titles), key=lambda xx: xx[1])
        ]

        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])

        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]

    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query

    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        self.tf = defaultdict(Counter)  # term frequency
        # e.g. tf[1]['winter'] == 2.0 means the word 'winter' occurs twice in
        # the second document (doc index 1)

        # takes around 5 seconds to build
        inv_index = defaultdict(set)
        for i in range(len(self.docs)):
            for word in self.docs[i]:
                self.tf[i][word] += 1.
                inv_index[word].add(i)

        self.inv_index = inv_index

    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        return self.inv_index[word]

    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)

    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (i.e. an AND
        query).
        Return an empty list if the query does not return any documents.
        """
        out = self.get_posting(query[0])
        if len(query) > 1:
            for word in query[1:]:
                out = self.get_posting(word).intersection(out)
        return sorted(out)

    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching documents
        found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)

    def compute_tfidf(self):
        print "Calculating tf-idf..."

        self.tfidf = defaultdict(Counter)
        self.doc_tfidf = defaultdict(float)  #used in 'cosine similarity'
        N = len(self.docs)  #number of whole documents

        for word in self.vocab:
            idf = math.log10(float(N) / len(self.get_posting(word)))
            for i in range(N):
                try:
                    self.tfidf[i][word] = (1. +
                                           math.log10(self.tf[i][word])) * idf
                    self.doc_tfidf[i] += self.tfidf[i][word]**2
                except ValueError:
                    self.tfidf[i][word] = 0.

    def get_tfidf(self, word, document):
        return self.tfidf[document][word]

    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)

    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for _ in range(len(self.docs))]

        query_tf = Counter(query)
        for word in query:
            query_weight = 1. + math.log10(query_tf[word])
            posting_set = self.get_posting(word)
            for d in posting_set:
                scores[d] += self.tfidf[d][word] * query_weight

        for d in range(len(self.docs)):
            scores[d] /= math.sqrt(self.doc_tfidf[d])

        # ------------------------------------------------------------------
        #Sort the 'scores'
        ranking = [
            idx for idx, sim in sorted(
                enumerate(scores), key=lambda xx: xx[1], reverse=True)
        ]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results

    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
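
For orientation, here is a sketch of how an IRSystem like the one above would typically be driven end to end. The ../data/RiderHaggard path is only assumed here because it appears in the error message of __read_stemmed_data; nothing in this snippet verifies it.

# Illustrative driver only (assumes the IRSystem class above is importable and
# that a ../data/RiderHaggard directory with a raw/ subfolder exists).
irsys = IRSystem()
irsys.read_data('../data/RiderHaggard')   # stems documents, caching to stemmed/
irsys.index()                             # builds self.tf and self.inv_index
irsys.compute_tfidf()                     # precomputes tf-idf weights

print(irsys.query_retrieve("allan quatermain"))      # boolean AND retrieval
for doc_id, score in irsys.query_rank("king solomon mines"):
    print("%s  %.4f" % (irsys.titles[doc_id], score))
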
Example #46
0
class TextIndex:
    def __init__(self):
        self.index = defaultdict(list)
        self.p = PorterStemmer()
        
    '''get stop words from stopwords file'''
    def getStopWords(self, stopwordsFile):
        f = open(stopwordsFile, 'r')
        stopwords = [line.rstrip() for line in f]
        self.sw = dict.fromkeys(stopwords)
        f.close()

    '''Create an inverted index to store word-document pairs'''        
    def create(self, docList, dirPath, stopwordsFile):

        self.getStopWords(dirPath + stopwordsFile)
        
        for d in docList:
            file = open(dirPath + d)
            pos = 1
            docIndex = {}
            for word in file.read().split():
                '''Lowercase the word and strip leading/trailing periods'''
                key = word.lower().strip(".")
                if key not in self.sw:
                    '''Use the Porter Stemmer algorithm to stem words.'''
                    key = self.p.stem(key, 0, len(key) - 1)
                    try:
                        docIndex[key][1].append(pos)
                    except KeyError:
                        docIndex[key] = [d, array('I', [pos])]
                pos += 1
            file.close()

            '''Merge the per-document index into the global index (keyed by term)'''
            for term, posting in docIndex.items():
                self.index[term].append(posting)
        print(self.index)

    '''Get the query type''' 
    def getQueryType(self, query):
        if '"' in query:
            return 'PQ' 
        elif (len(query.split()) > 1):
            return 'FTQ' 
        else:
            return 'OWQ'
        
    '''Query the Index created above'''
    def queryIndex(self):
        while True:
            q = sys.stdin.readline()
            q = q.rstrip()
            if q == '':
                break

            queryType = self.getQueryType(q)
            if queryType == 'OWQ':
                self.oneWordQuery(q)
            elif queryType == 'FTQ':
                self.freeTextQuery(q)

    '''One Word Query'''
    def oneWordQuery(self, q):
        originalQuery = q
        q = self.p.stem(q, 0, len(q) - 1)

        if len(q) == 0:
            print('Length of q is zero')
            return

        print("'{}'".format(q))

        '''Query contains only one word; look up the unquoted stem'''
        if q not in self.index:
            print('q is not in index')
            return
        else:
            pos = self.index[q]
            pos = [x[0] for x in pos]
            pos = ' '.join(pos)
            print(pos)

    '''Extract words from the free text query '''
    def getTerms(self, line):
        line = line.lower()
        '''replace non alphanumeric characters with space'''
        line = re.sub(r'[^a-z0-9 ]',' ',line)

        line = line.split()
        line = [x for x in line if x not in self.sw]
        line = [self.p.stem(word, 0, len(word) -1) for word in line]
        return line

    '''This function returns the intersection of lists'''
    def intersectsLists(self, lists):
        if len(lists) == 0:
            return []

        '''Sort the list on the basis of length such that smallest item appears first'''
        lists.sort(key=len)
        return list(reduce(lambda x, y: set(x) & set(y), lists))

    def getPostings(self, terms):
        '''all terms in the list are guaranteed to be in the index'''
        return [self.index[term] for term in terms]

    def getDocsFromPostings(self, postings):
        '''no empty list in postings'''
        return [[x[0] for x in p] for p in postings]

    '''Free Text Query'''
    def freeTextQuery(self, q):
        q = self.getTerms(q)

        if len(q)==0:
            print('')
            return

        li = set()
        for term in q:
            if term in self.index:
                p = self.index[term]
                p = [x[0] for x in p]
                li = li | set(p)
            # terms not in the index contribute no documents

        li = list(li)
        li.sort()
        print(' '.join(li))

    '''Phrase Query'''
    def phraseQuery(self, q):
        originalQuery=q
        q = self.getTerms(q)
        if len(q) == 0:
            print('')
            return
        elif len(q) == 1:
            self.oneWordQuery(originalQuery)
            return

        phraseDocs = self.phraseQueryDocs(q)

        print(' '.join(map(str, phraseDocs)))



    def phraseQueryDocs(self, termList):
        phraseDocs = []
        length = len(termList)

        '''first find matching docs'''
        for term in termList:
            if term not in self.index:
                '''if a term doesn't appear in the index there can't be any document matching it'''
                return []

        postings = self.getPostings(termList)
        docs = self.getDocsFromPostings(postings)

        '''docs are the documents that contain every term in the query'''
        docs = self.intersectsLists(docs)
        '''postings are the postings list of the terms in the documents docs only'''
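
phraseQueryDocs is truncated here. Purely as an illustration (not the original continuation), the following standalone sketch shows the usual consecutive-position check over positional postings restricted to a single document.

def phrase_in_doc(positions_per_term):
    """Hypothetical helper: positions_per_term[k] is the list of positions of
    the k-th phrase term within one document; return True if the terms occur
    at consecutive positions."""
    first = positions_per_term[0]
    rest = [set(p) for p in positions_per_term[1:]]
    for start in first:
        # the k-th remaining term must appear at position start + k + 1
        if all((start + k + 1) in rest[k] for k in range(len(rest))):
            return True
    return False

# "king solomon": king at positions [3, 10], solomon at [4, 20] -> True
print(phrase_in_doc([[3, 10], [4, 20]]))
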
Example #47
0
class IRSystem:

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()


    def get_uniq_words(self):
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq


    def __read_raw_data(self, dirname):
        print "Stemming Documents..."

        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print "    Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's contents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs


    def __read_stemmed_data(self, dirname):
        print "Already stemmed!"
        titles = []
        docs = []

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)

        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's contents
                contents.extend(line)
            f.close()
            docs.append(contents)

        return titles, docs


    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).

        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)

        # Sort documents alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles),
            key = lambda xx : xx[1])]

        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])

        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]


    def compute_tfidf(self):
        # -------------------------------------------------------------------
        # TODO: Compute and store TF-IDF values for words and documents.
        #       Recall that you can make use of:
        #         * self.vocab: a list of all distinct (stemmed) words
        #         * self.docs: a list of lists, where the i-th document is
        #                   self.docs[i] => ['word1', 'word2', ..., 'wordN']
        #       NOTE that you probably do *not* want to store a value for every
        #       word-document pair, but rather just for those pairs where a
        #       word actually occurs in the document.

        print "Calculating tf-idf..."
        self.tfidf = {}
        
        # initialized
        for word in self.vocab:
            for d in range(len(self.docs)):
                if word not in self.tfidf:
                    self.tfidf[word] = {}
                self.tfidf[word][d] = 0.0
        
        N = len(self.docs)
        for word in self.vocab:
            indices = self.inv_index[word]
            for i in indices:                
                tf = 1 + math.log10(indices[i])
                idf = math.log10(N*1.0 / len(self.get_posting(word)))
                self.tfidf[word][i] = tf * idf
        
        #print self.tfidf
        # ------------------------------------------------------------------


    def get_tfidf(self, word, document):
        # ------------------------------------------------------------------
        # TODO: Return the tf-idf weighting for the given word (string) and
        #       document index.
        tfidf = 0.0
        
        if word in self.tfidf:
            tfidf = self.tfidf[word][document]
        
        # ------------------------------------------------------------------
        return tfidf


    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)


    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        # ------------------------------------------------------------------
        # TODO: Create an inverted index.
        #       Granted this may not be a linked list as in a proper
        #       implementation.
        #       Some helpful instance variables:
        #         * self.docs = List of documents
        #         * self.titles = List of titles

        # Example: inv_index['separ'] = {54: 3}  in doc id 54, occurs 3 times!
        
        inv_index = {}
                        
        for word in self.vocab:
            inv_index[word] = {}

        numdocs = len(self.docs)
        
        for d in xrange(0, numdocs):
            doc = self.docs[d]
            for word in doc:
                #if word == "zulu":
                #    print "zulu", inv_index[word]
                    
                if d in inv_index[word]:
                    inv_index[word][d] = inv_index[word][d]+1
                else:                    
                    inv_index[word][d] = 1


        #print inv_index['separ']
        #print "zulu inverted index", inv_index['zulu']
        #print inv_index
        self.inv_index = inv_index

        # ------------------------------------------------------------------


    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        # ------------------------------------------------------------------
        # TODO: return the list of postings for a word.
        posting = []
        
        for i in self.inv_index[word]:
            posting.append(i)
            
        posting.sort()
        
        #if word == "zulu":
        #    print "posting for word", word , posting
        
        return posting
        # ------------------------------------------------------------------


    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)


    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (i.e. an AND
        query).
        Return an empty list if the query does not return any documents.
        """
        # ------------------------------------------------------------------
        # TODO: Implement Boolean retrieval. You will want to use your
        #       inverted index that you created in index().
        # Right now this just returns all the possible documents!
        qsets = {}
        for qword in query:
            qsets[qword] = set()
            
            if qword in self.inv_index:
                for i in self.inv_index[qword]:
                    qsets[qword].add(i)
                    
        #for qword in qsets:
        #    print "word", qword, "set",  qsets[qword] 
            
        # initial set
        final = qsets[query[0]]
        for x in range(1, len(query)):
            final = final.intersection(qsets[query[x]])
        
        #print "final set ",  final
        
        docs = list(final)

        # ------------------------------------------------------------------

        return sorted(docs)   # sorted doesn't actually matter


    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for xx in range(len(self.docs))]
        # ------------------------------------------------------------------
        # TODO: Implement cosine similarity between a document and a list of
        #       query words.

        # Right now, this code simply gets the score by taking the Jaccard
        # similarity between the query and every document.
        
        tf = {}        
        
        words_in_query = set()
        for word in query:
            words_in_query.add(word)
            
            if word not in tf:
                tf[word] = 1
            else:
                tf[word] = tf[word]+1
        
        
        #print query, tf

        for d, doc in enumerate(self.docs):
            words_in_doc = set(doc)
            #scores[d] = len(words_in_query.intersection(words_in_doc)) \
            #        / float(len(words_in_query.union(words_in_doc)))
                    
            union = words_in_query.union(words_in_doc)
            #inter = words_in_query.intersection(words_in_doc)
            
#            ltclnn = {}
#            
#            for w in union:
#                ltclnn[w] = {}
#                ltclnn[w]["dn"] = 0
#                ltclnn[w]["qn"] = 0
#                if w in tf:
#                    ltclnn[w]["qwt"] = 1+ math.log10(tf[w])
#                    ltclnn[w]["qn"] = ltclnn[w]["qn"] + ltclnn[w]["qwt"]**2
#                else:
#                    ltclnn[w]["qwt"] = 0
#                    ltclnn[w]["qn"] = 0
#                    
#                ltclnn[w]["dwt"] = self.get_tfidf(w, d)
#                ltclnn[w]["dn"] = ltclnn[w]["dn"] + ltclnn[w]["dwt"]**2
#                
#            for w in ltclnn:
#                ltclnn[w]["qwtn"] = ltclnn[w]["qwt"] / math.sqrt(ltclnn[w]["qn"])
#                ltclnn[w]["dwtn"] = ltclnn[w]["dwt"] / math.sqrt(ltclnn[w]["dn"])
#            
#            prod = 0
#            for w in ltclnn:
#                prod = prod + ltclnn[w]["qwtn"] * ltclnn[w]["dwtn"]
#            
#            scores[d] = prod            
            
            ltc_sum = 0
            #lnn_sum = 0
            ltc_lnn = 0
            for term in union:                
                                
                ltc = self.get_tfidf(term, d)
                
                ltc_sum = ltc_sum + ltc*ltc
                
                if term in tf:
                    lnn = 1 + math.log10(tf[term])
                else:
                    lnn = 0
                    
                #lnn_sum = lnn_sum + lnn*lnn
                ltc_lnn = ltc_lnn + ltc*lnn
            
            scores[d] = ltc_lnn / math.sqrt(ltc_sum)
                
        
        #print scores

        # ------------------------------------------------------------------

        ranking = [idx for idx, sim in sorted(enumerate(scores),
            key = lambda xx : xx[1], reverse = True)]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results


    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query


    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching documents
        found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)


    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
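
In the rank_retrieve above, the query side is weighted lnn (1 + log10 of the in-query term count) and the document side ltc (the tf-idf), with normalization by the document norm only. The compact sketch below computes that score for a single document using made-up weights; the function name and numbers are illustrative.

import math
from collections import Counter

def ltc_lnn_score(query_terms, doc_tfidf):
    """doc_tfidf maps term -> ltc weight in this document (absent terms = 0)."""
    qtf = Counter(query_terms)
    dot = sum((1.0 + math.log10(c)) * doc_tfidf.get(t, 0.0)
              for t, c in qtf.items())
    doc_norm = math.sqrt(sum(w * w for w in doc_tfidf.values()))
    return dot / doc_norm if doc_norm else 0.0

# Illustrative weights only.
print(ltc_lnn_score(["king", "king", "mine"],
                    {"king": 0.18, "mine": 0.48, "she": 0.62}))
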
Example #48
0
class IRSystem:

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()


    def get_uniq_words(self):
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq


    def __read_raw_data(self, dirname):
        print "Stemming Documents..."

        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print "    Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's contents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs


    def __read_stemmed_data(self, dirname):
        print "Already stemmed!"
        titles = []
        docs = []

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)

        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's contents
                contents.extend(line)
            f.close()
            docs.append(contents)

        return titles, docs


    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).

        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)

        # Sort documents alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles),
            key = lambda xx : xx[1])]

        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])

        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]


    def compute_tfidf(self):
        print "Calculating tf-idf..."
        self.tfidf = {}
        n = float(len(self.docs))
                    
        for word in self.vocab:
            df  = len(self.inv_index[word])
            idf = math.log(n/df, 10)
            
            for d in range(len(self.docs)):
                if d not in self.tfidf:
                    self.tfidf[d] = {}
                tf = len(self.inv_index[word].get(d, []))
                if tf == 0:
                    self.tfidf[d][word] = 0.0
                else:
                    self.tfidf[d][word] = (1 + math.log(tf, 10)) * idf


    def get_tfidf(self, word, document):
        return self.tfidf[document][word]


    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)


    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        
        inv_index = {}
        for i in range(len(self.docs)):
            for j in range(len(self.docs[i])):
                word = self.docs[i][j]
                if word not in inv_index:
                    inv_index[word] = {}
                if i not in inv_index[word]:
                    inv_index[word][i] = []
                inv_index[word][i].append(j)

        self.inv_index = inv_index


    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        posting = self.inv_index.get(word).keys()
        return sorted(posting)
        # ------------------------------------------------------------------


    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)


    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (i.e. an AND
        query).
        Return an empty list if the query does not return any documents.
        """
        docs = range(len(self.docs))
        for word in query:
            docs = list(set(self.get_posting(word)) & set(docs))

        return sorted(docs)   # sorted doesn't actually matter


    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        scores = [0.0 for xx in range(len(self.docs))]
        
        query_vector = {}
        for word in self.vocab:
            tf = query.count(word)
            if tf == 0:
                query_vector[word] = 0.0
            else:
                query_vector[word] = 1 + math.log(tf, 10)
        
        for d in range(len(self.docs)):
            doc_vector = self.tfidf[d]
            m1 = 0.0
            m2 = 0.0
            dp = 0.0
            for word in query_vector:
                m1 += math.pow(query_vector[word], 2)
                m2 += math.pow(doc_vector[word], 2)
                dp += query_vector[word] * doc_vector[word]
            
            scores[d] = dp / math.sqrt(m2)

        ranking = [idx for idx, sim in sorted(enumerate(scores),
            key = lambda xx : xx[1], reverse = True)]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results


    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query


    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching documents
        found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)


    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
Example #49
0
def stemAndRemoveFrequence(text):
	p = PorterStemmer()
	words = []
	for word in text:
		words.append( p.stem(word, 0, len(word)-1) )
	return set(words)
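
A usage note for the helper above: because a set is returned, repeated tokens collapse to a single stem. The call below is illustrative and assumes the classic Porter behaviour of mapping 'running' and 'runs' to 'run'.

print(stemAndRemoveFrequence(["running", "runs", "run"]))   # e.g. set(['run'])
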
Example #50
0
    if line['kind'] == 'meeting':
        course['instructors'] = 

for course in courses:        
        doc_vect = {}
        terms = set()
        doc_vector.append(doc_vect)
        titles_vector.append(course['code'] + "   " + course['title'])

        title_word_vect = course['title'].split(" ")
        descp_word_vect = str(course['description']).split(' ')
        
        prev = ""
        for title_word in title_word_vect:
            title_word = title_word.lower()
            title_word = p.stem(title_word, 0, len(title_word)-1)
            if title_word not in doc_vect:
                doc_vect[title_word] = TITLE
            else:
                doc_vect[title_word] += TITLE

            if title_word not in corp_freq_hash:
                corp_freq_hash[title_word] = 1
            else:
                corp_freq_hash[title_word] += 1        
            terms.add(title_word)

            if prev:
                bigram = prev+" "+title_word
                if bigram not in doc_vect:
                    doc_vect[bigram] = TITLE
Example #51
0
def stemWords(l):
    ps = PorterStemmer()
    return [ps.stem(x, 0, len(x) - 1) for x in l]
Example #52
0
def stem_words(l):
    ps = PorterStemmer()
    return [ps.stem(x, 0, len(x) - 1) for x in l]
Example #53
0
    def stemming(self, listOfWords):
        p = PorterStemmer()
        stemList = []
        for word in listOfWords:
            stemList.append(p.stem(word, 0, len(word) - 1))
        return stemList
Example #54
0
def stem(word):
	p = PorterStemmer()
	return p.stem(word,0,len(word)-1).encode('utf8')
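
These small wrappers (Examples #51 to #54) all call the reference Porter implementation, whose stem(word, start, end) signature operates on a buffer slice, which is why they pass 0 and len(word) - 1. A minimal example of that call form follows; note that the IRSystem classes above rely on a variant whose stem() takes only the word, so which signature is available depends on the local porter module.

p = PorterStemmer()
word = "running"
print(p.stem(word, 0, len(word) - 1))   # -> 'run' with the reference stemmer
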
Example #55
0
class IRSystem:
    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()

    def get_uniq_words(self):
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq

    def __read_raw_data(self, dirname):
        print "Stemming Documents..."

        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print "    Doc %d of %d: %s" % (i + 1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's contents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)
        return titles, docs

    def __read_stemmed_data(self, dirname):
        print "Already stemmed!"
        titles = []
        docs = []

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)

        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's contents
                contents.extend(line)
            f.close()
            docs.append(contents)

        return titles, docs

    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).

        print "Reading in documents..."
        # dict mapping file names to list of "words" (tokens)
        filenames = os.listdir(dirname)
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)

        # Sort documents alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [
            idx
            for idx, title in sorted(enumerate(titles), key=lambda xx: xx[1])
        ]

        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])

        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]

    def compute_tfidf(self):
        # -------------------------------------------------------------------
        # TODO: Compute and store TF-IDF values for words and documents.
        #       Recall that you can make use of:
        #         * self.vocab: a list of all distinct (stemmed) words
        #         * self.docs: a list of lists, where the i-th document is
        #                   self.docs[i] => ['word1', 'word2', ..., 'wordN']
        #       NOTE that you probably do *not* want to store a value for every
        #       word-document pair, but rather just for those pairs where a
        #       word actually occurs in the document.
        print "Calculating tf-idf..."
        self.tfidf = {}
        self.tf = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0))

        for docid, doc in enumerate(self.docs):
            for word in doc:
                self.tf[docid][word] += 1

        for word in self.vocab:
            if word not in self.tfidf:
                self.tfidf[word] = {}
            for d in range(len(self.docs)):
                if self.tf[d][word] > 0:
                    tf = 1 + math.log10(self.tf[d][word])
                    df = len(self.inv_index[word])
                    idf = math.log10(len(self.docs) * 1.0 / df)
                    self.tfidf[word][d] = tf * idf
        # ------------------------------------------------------------------

    def get_tfidf(self, word, document):
        # ------------------------------------------------------------------
        # TODO: Return the tf-idf weighting for the given word (string) and
        #       document index.
        #tfidf = 0.0
        # ------------------------------------------------------------------
        return self.tfidf[word].get(document, 0.0)

    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)

    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        # ------------------------------------------------------------------
        # TODO: Create an inverted index.
        #       Granted this may not be a linked list as in a proper
        #       implementation.
        #       Some helpful instance variables:
        #         * self.docs = List of documents
        #         * self.titles = List of titles

        self.inv_index = collections.defaultdict(set)
        for docid, doc in enumerate(self.docs):
            for word in doc:
                self.inv_index[word].add(docid)

        # ------------------------------------------------------------------

    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        # ------------------------------------------------------------------
        # TODO: return the list of postings for a word.
        posting = self.inv_index[word]

        return posting
        # ------------------------------------------------------------------

    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)

    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (i.e. an AND
        query).
        Return an empty list if the query does not return any documents.
        """
        # ------------------------------------------------------------------
        # TODO: Implement Boolean retrieval. You will want to use your
        #       inverted index that you created in index().
        # Right now this just returns all the possible documents!
        docSet = set()
        for term in query:
            if len(docSet) == 0:
                docSet = self.get_posting(term)
            else:
                docSet &= self.get_posting(term)
        # ------------------------------------------------------------------

        return sorted(list(docSet))  # sorted doesn't actually matter

    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        # ------------------------------------------------------------------
        # TODO: Implement cosine similarity between a document and a list of
        #       query words.

        # Right now, this code simply gets the score by taking the Jaccard
        # similarity between the query and every document.
        """
        words_in_query = set()
        for word in query:
            words_in_query.add(word)

        for d, doc in enumerate(self.docs):
            words_in_doc = set(doc)
            scores[d] = len(words_in_query.intersection(words_in_doc)) \
                    / float(len(words_in_query.union(words_in_doc)))
        """
        # ------------------------------------------------------------------
        scores = collections.defaultdict(lambda: 0.0)

        for d in range(len(self.docs)):
            numerator = 0.0
            denominator = 0.0
            # ltc weighting on the query side: each *distinct* term contributes
            # (1 + log10 of its count in the query) times its tf-idf weight.
            for term in set(query):
                count = query.count(term)
                numerator += (1 + math.log10(count)) * self.get_tfidf(term, d)
            # Document length for cosine normalization: sqrt of the sum of
            # squared tf-idf weights over the document's distinct words.
            for tt in set(self.docs[d]):
                denominator += self.get_tfidf(tt, d) ** 2
            denominator = denominator ** 0.5
            scores[d] = numerator / denominator
        temp = []
        for idx in range(len(self.docs)):
            temp.append(scores[idx])

        ranking = [
            idx for idx, sim in sorted(
                enumerate(temp), key=lambda xx: xx[1], reverse=True)
        ]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results

    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query

    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching documents
        found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)

    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
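
The cosine scoring in rank_retrieve above multiplies, for each distinct query term, (1 + log10 of the term's count in the query) by the term's tf-idf weight in the document, then divides by the document's vector length. A minimal self-contained sketch of that arithmetic with made-up weights (the terms and numbers below are illustrative only, not taken from the corpus):

import math

# Hypothetical tf-idf weights for one document: term -> weight.
doc_tfidf = {"king": 0.7, "mine": 1.3, "solomon": 2.1}
query = ["king", "mine", "mine"]

numerator = sum((1 + math.log10(query.count(t))) * doc_tfidf.get(t, 0.0)
                for t in set(query))
doc_length = math.sqrt(sum(w * w for w in doc_tfidf.values()))
print(numerator / doc_length)  # cosine-style score of this document for the query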
Example #56
class IRSystem:

    def __init__(self):
        # For holding the data - initialized in read_data()
        self.titles = []
        self.docs = []
        self.vocab = []
        # For the text pre-processing.
        self.alphanum = re.compile('[^a-zA-Z0-9]')
        self.p = PorterStemmer()


    def get_uniq_words(self):
        uniq = set()
        for doc in self.docs:
            for word in doc:
                uniq.add(word)
        return uniq


    def __read_raw_data(self, dirname):
        print "Stemming Documents..."

        titles = []
        docs = []
        os.mkdir('%s/stemmed' % dirname)
        title_pattern = re.compile('(.*) \d+\.txt')

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/raw' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        for i, filename in enumerate(filenames):
            title = title_pattern.search(filename).group(1)
            print "    Doc %d of %d: %s" % (i+1, len(filenames), title)
            titles.append(title)
            contents = []
            f = open('%s/raw/%s' % (dirname, filename), 'r')
            of = open('%s/stemmed/%s.txt' % (dirname, title), 'w')
            for line in f:
                # make sure everything is lower case
                line = line.lower()
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # remove non alphanumeric characters
                line = [self.alphanum.sub('', xx) for xx in line]
                # remove any words that are now empty
                line = [xx for xx in line if xx != '']
                # stem words
                line = [self.p.stem(xx) for xx in line]
                # add to the document's contents
                contents.extend(line)
                if len(line) > 0:
                    of.write(" ".join(line))
                    of.write('\n')
            f.close()
            of.close()
            docs.append(contents)

        return titles, docs


    def __read_stemmed_data(self, dirname):
        print "Already stemmed!"
        titles = []
        docs = []

        # make sure we're only getting the files we actually want
        filenames = []
        for filename in os.listdir('%s/stemmed' % dirname):
            if filename.endswith(".txt") and not filename.startswith("."):
                filenames.append(filename)

        if len(filenames) != 60:
            msg = "There are not 60 documents in ../data/RiderHaggard/stemmed/\n"
            msg += "Remove ../data/RiderHaggard/stemmed/ directory and re-run."
            raise Exception(msg)

        for i, filename in enumerate(filenames):
            title = filename.split('.')[0]
            titles.append(title)
            contents = []
            f = open('%s/stemmed/%s' % (dirname, filename), 'r')
            for line in f:
                # split on whitespace
                line = [xx.strip() for xx in line.split()]
                # add to the document's contents
                contents.extend(line)
            f.close()
            docs.append(contents)

        return titles, docs


    def read_data(self, dirname):
        """
        Given the location of the 'data' directory, reads in the documents to
        be indexed.
        """
        # NOTE: We cache stemmed documents for speed
        #       (i.e. write to files in new 'stemmed/' dir).

        print "Reading in documents..."
        # Use the cached stemmed documents if they exist; otherwise stem the raw ones.
        subdirs = os.listdir(dirname)
        if 'stemmed' in subdirs:
            titles, docs = self.__read_stemmed_data(dirname)
        else:
            titles, docs = self.__read_raw_data(dirname)

        # Sort documents alphabetically by title to ensure we have the proper
        # document indices when referring to them.
        ordering = [idx for idx, title in sorted(enumerate(titles),
            key = lambda xx : xx[1])]

        self.titles = []
        self.docs = []
        numdocs = len(docs)
        for d in range(numdocs):
            self.titles.append(titles[ordering[d]])
            self.docs.append(docs[ordering[d]])

        # Get the vocabulary.
        self.vocab = [xx for xx in self.get_uniq_words()]

        # My additions - sorted documents
        self.sorted_docs = [sorted(doc) for doc in self.docs]
        self.reverse_sorted_docs = [doc[:] for doc in self.sorted_docs]
        for doc in self.reverse_sorted_docs:
            doc.reverse()

    def compute_idf(self, word):
        """
        Computes the idf of the given word. Requires the inverted index; build it first by calling self.index().
        """
        N = len(self.docs)
        df = len(self.inv_index[word])
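        # e.g., with N = 60 documents and a word appearing in df = 6 of them,
        # this returns log10(60 / 6) = 1.0 (illustrative numbers).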
        return math.log(N * 1.0 / df, 10)

    def compute_tfidf(self):
        # -------------------------------------------------------------------
        # TODO: Compute and store TF-IDF values for words and documents.
        #       Recall that you can make use of:
        #         * self.vocab: a list of all distinct (stemmed) words
        #         * self.docs: a list of lists, where the i-th document is
        #                   self.docs[i] => ['word1', 'word2', ..., 'wordN']
        #       NOTE that you probably do *not* want to store a value for every
        #       word-document pair, but rather just for those pairs where a
        #       word actually occurs in the document.
        print "Calculating tf-idf..."
        self.tfidf = {}

        for doc_id, doc in enumerate(self.docs):

            word_counter = Counter(self.docs[doc_id])
            for word in word_counter.keys():
                if word not in self.tfidf:
                    self.tfidf[word] = {}

                self.tfidf[word][doc_id] = 1 + math.log(word_counter[word], 10)

        for word in self.tfidf:
            idf = self.compute_idf(word)

            for doc_id in self.tfidf[word]:
                self.tfidf[word][doc_id] *= idf


    def get_tfidf(self, word, document):
        return self.tfidf[word][document]


    def get_tfidf_unstemmed(self, word, document):
        """
        This function gets the TF-IDF of an *unstemmed* word in a document.
        Stems the word and then calls get_tfidf. You should *not* need to
        change this interface, but it is necessary for submission.
        """
        word = self.p.stem(word)
        return self.get_tfidf(word, document)


    def index(self):
        """
        Build an index of the documents.
        """
        print "Indexing..."
        # ------------------------------------------------------------------
        # TODO: Create an inverted index.
        #       Granted this may not be a linked list as in a proper
        #       implementation.
        #       Some helpful instance variables:
        #         * self.docs = List of documents
        #         * self.titles = List of titles

        inv_index = {}
        for word in self.vocab:
            inv_index[word] = []

        for doc_id, doc in enumerate(self.docs):
            for word in sorted(list(set(doc))):
                inv_index[word].append(doc_id)

        
        self.inv_index = inv_index

        # ------------------------------------------------------------------


    def get_posting(self, word):
        """
        Given a word, this returns the list of document indices (sorted) in
        which the word occurs.
        """
        return self.inv_index[word]


    def get_posting_unstemmed(self, word):
        """
        Given a word, this *stems* the word and then calls get_posting on the
        stemmed word to get its postings list. You should *not* need to change
        this function. It is needed for submission.
        """
        word = self.p.stem(word)
        return self.get_posting(word)


    def boolean_retrieve(self, query):
        """
        Given a query in the form of a list of *stemmed* words, this returns
        the list of documents in which *all* of those words occur (ie an AND
        query).
        Return an empty list if the query does not return any documents.
        """

        postings = map(set,map(self.get_posting, query))
        matching_doc_ids = set(range(len(self.docs))).intersection(*postings)

        return sorted(matching_doc_ids)

    def calculate_score_by_jaccard(self, query):
        scores = [0.0 for xx in range(len(self.docs))]
        
        words_in_query = set(query)

        for d, doc in enumerate(self.docs):
            words_in_doc = set(doc)
            scores[d] = len(words_in_query.intersection(words_in_doc)) / float(len(words_in_query.union(words_in_doc)))

        return scores

    def normalized_length(self, doc_id):
        """
        Returns the cosine-normalization length of the given document: the
        square root of the sum of squared tf-idf weights over the document's
        distinct words.
        """
        word_wtf_in_doc = [self.tfidf[word][doc_id] for word in set(self.docs[doc_id])]
        return sum(map(lambda x: x**2, word_wtf_in_doc)) ** 0.5

    def calculate_cosine_scores(self, query):
        scores = [0.0 for xx in range(len(self.docs))]
        lengths = [self.normalized_length(doc_id) for doc_id in range(len(self.docs))]

        for term in set(query):
            query_weighted_term_frequency = 1 + math.log(query.count(term), 10)
            for doc_id in self.get_posting(term):
                scores[doc_id] += self.tfidf[term][doc_id] * query_weighted_term_frequency

        scores = [score/length for score,length in zip(scores,lengths)]
        return scores


    def rank_retrieve(self, query):
        """
        Given a query (a list of words), return a rank-ordered list of
        documents (by ID) and score for the query.
        """
        #scores = self.calculate_score_by_jaccard(query)
        scores = self.calculate_cosine_scores(query)

        ranking = [idx for idx, sim in sorted(enumerate(scores),
            key = lambda xx : xx[1], reverse = True)]
        results = []
        for i in range(10):
            results.append((ranking[i], scores[ranking[i]]))
        return results


    def process_query(self, query_str):
        """
        Given a query string, process it and return the list of lowercase,
        alphanumeric, stemmed words in the string.
        """
        # make sure everything is lower case
        query = query_str.lower()
        # split on whitespace
        query = query.split()
        # remove non alphanumeric characters
        query = [self.alphanum.sub('', xx) for xx in query]
        # stem words
        query = [self.p.stem(xx) for xx in query]
        return query


    def query_retrieve(self, query_str):
        """
        Given a string, process and then return the list of matching documents
        found by boolean_retrieve().
        """
        query = self.process_query(query_str)
        return self.boolean_retrieve(query)


    def query_rank(self, query_str):
        """
        Given a string, process and then return the list of the top matching
        documents, rank-ordered.
        """
        query = self.process_query(query_str)
        return self.rank_retrieve(query)
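
boolean_retrieve above answers an AND query by intersecting the postings sets of the query terms. A small self-contained sketch of the same idea on a toy corpus (the documents and words here are hypothetical):

# Toy corpus; in the class above the postings come from self.inv_index.
docs = [["the", "king", "returns"], ["king", "solomon", "mine"], ["she", "returns"]]

inv_index = {}
for doc_id, words in enumerate(docs):
    for word in set(words):
        inv_index.setdefault(word, set()).add(doc_id)

def boolean_and(query):
    postings = [inv_index.get(word, set()) for word in query]
    return sorted(set.intersection(*postings)) if postings else []

print(boolean_and(["king", "returns"]))  # -> [0]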
Example #57
class Chatbot:
    """Simple class to implement the chatbot for PA 6."""
    def __init__(self, creative=False):
        # The chatbot's default name is `moviebot`. Give your chatbot a new name.
        self.name = 'moviebot'

        self.creative = creative

        # This matrix has the following shape: num_movies x num_users
        # The values stored in each row i and column j is the rating for
        # movie i by user j
        self.titles, ratings = movielens.ratings()
        self.sentiment = movielens.sentiment()
        self.new_sentiment = {}
        self.p = PorterStemmer()

        # create a new sentiment dict with stemmed keys
        for key in self.sentiment:
            new_key = self.p.stem(key)
            self.new_sentiment[new_key] = self.sentiment[key]

        self.bin_ratings = self.binarize(ratings)

        # a tuple with the sentiment of the movie being discussed
        self.current_sentiment = None
        # the movie title entered by the user
        self.current_title = None
        # a list of current movie candidates
        self.current_idxs = []

        self.prev_movie = None
        self.prev_sentiment = None

        # a dict where dict[i] = j is the user's sentiment j for movie index i
        # for movies that the user has described and the chatbot has processed
        self.user_movies = {}

        # a set of movie indexes that the user has already described
        self.user_movie_set = set()

        self.prefix_match_found = False
        self.disambig = False

        # if chatbot is in recommend mode, only respond to yes or no
        self.recommend_mode = False

        # a list of recommendations for the user
        self.recommendations = []
        self.recommend_idx = 0

        # preprocess movie list by extracting possible titles and year
        self.movies = []
        for entry in self.titles:
            self.movies.append(extract_titles_and_year(entry[0]))
        #############################################################################
        # TODO: Binarize the movie ratings matrix.                                  #
        #############################################################################

        # The ratings were already binarized above (self.bin_ratings); keep the raw matrix as well.
        self.ratings = ratings
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################

    #############################################################################
    # 1. WARM UP REPL                                                           #
    #############################################################################

    def greeting(self):
        """Return a message that the chatbot uses to greet the user."""
        #############################################################################
        # TODO: Write a short greeting message                                      #
        #############################################################################

        greeting_message = "Hi there! I'm Movie Chatbot. How can I help you?"

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return greeting_message

    def goodbye(self):
        """Return a message that the chatbot uses to bid farewell to the user."""
        #############################################################################
        # TODO: Write a short farewell message                                      #
        #############################################################################

        goodbye_message = "Have a nice day! It was fun talking to you!"

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return goodbye_message

    ###############################################################################
    # 2. Modules 2 and 3: extraction and transformation                           #
    ###############################################################################

    def process(self, line):
        """Process a line of input from the REPL and generate a response.

      This is the method that is called by the REPL loop directly with user input.

      You should delegate most of the work of processing the user's input to
      the helper functions you write later in this class.

      Takes the input string from the REPL and call delegated functions that
        1) extract the relevant information, and
        2) transform the information into a response to the user.

      Example:
        resp = chatbot.process('I loved "The Notebok" so much!!')
        print(resp) // prints 'So you loved "The Notebook", huh?'

      :param line: a user-supplied line of text
      :returns: a string containing the chatbot's response to the user input
      """
        #############################################################################
        # TODO: Implement the extraction and transformation in this method,         #
        # possibly calling other functions. Although modular code is not graded,    #
        # it is highly recommended.                                                 #
        #############################################################################
        response = ''

        swear_response = self.checkSwearWords(line)
        if swear_response:
            return swear_response
        caps_lock_response = self.checkAnger(line)
        if caps_lock_response:
            return caps_lock_response

        if self.recommend_mode:
            if re.match('yes', line.strip(), re.I):
                return self.give_recommendation()
            elif re.match('no', line.strip(), re.I):
                return "Okay, I guess I've given you enough recommendations!"
            else:
                return "Let's talk about that later. Do you want another recommendation?"

        clarification = False
        if self.creative:
            # deal with "Can you...?", "What is...?", etc. questions
            response_to_question = self.matches_question(line)
            if response_to_question:
                return response_to_question
            elif self.disambig:
                self.current_idxs = self.disambiguate(line, self.current_idxs)
                #print(self.current_idxs)
                if len(self.current_idxs) == 1:
                    self.current_title = self.titles[self.current_idxs[0]][0]
                    self.disambig = False
                    clarification = True
                else:
                    response = "Sorry, can you be a little more specific? I still found the following movies:\n"
                    for i in self.current_idxs:
                        response += "{}\n".format(self.titles[i][0])
                    return response

        # extract titles and matches
        extracted_title_from_current_line = False
        if not self.current_title:
            matches = self.get_possible_matching_titles(line)
            extracted_title_from_current_line = True
            #print('Extracted title')
        else:
            matches = [(self.current_title, self.current_idxs)]
        #print('Current title:{}'.format(self.current_title))
        # extract sentiment
        extracted_sentiment_from_current_line = False
        if not self.current_sentiment:
            # remove title from line for sentiment extraction
            if matches:
                line = line.replace(matches[0][0], '')
            sentiment = self.extract_sentiment(line)
            extracted_sentiment_from_current_line = True
            self.current_sentiment = sentiment
            #print('Extracted sentiment')
        else:
            sentiment = self.current_sentiment
        #print('Current sentiment:{}'.format(self.current_sentiment))

        if self.creative:
            if not extracted_title_from_current_line and \
               extracted_sentiment_from_current_line:
                if not clarification and not contains_anaphoric_expression(
                        line):
                    #print('no anaphoric expression')
                    return self.generate_response_to_irrelevant_input()

        if self.creative:
            if len(matches) == 0 and not self.current_title:
                return self.generate_response_to_irrelevant_input()
            elif len(matches) > 1:
                return 'Please tell me about one movie at a time.'
            elif len(matches) == 1:
                title, idxs = matches[0]
                self.current_idxs = idxs
                self.current_title = title
                if len(idxs) == 0:
                    self.clear_current_movie()
                    return "Hmm, I couldn't find a match for \"{}\". Please tell me about some other movies you have watched!".format(
                        title)
                elif len(idxs) == 1:
                    if idxs[0] in self.user_movie_set:
                        response = "(I think you already told me about that movie, but I'll update what you tell me!)\n"

                    if sentiment == 0:
                        return response + "I'm a little confused. What did you think about \"{}\"?".format(
                            self.titles[idxs[0]][0])
                    if sentiment == 1:
                        response += "Great, so you liked \"{}\".".format(
                            self.titles[idxs[0]][0])
                    elif sentiment == 2:
                        response += "Wow, you really loved \"{}\"!".format(
                            self.titles[idxs[0]][0])
                    elif sentiment == -1:
                        response += "Okay, you didn't like \"{}\".".format(
                            self.titles[idxs[0]][0])
                    elif sentiment == -2:
                        response += "It seems like you hated \"{}\" with a passion! That's too bad.".format(
                            self.titles[idxs[0]][0])
                    self.process_movie(idxs[0], sentiment)
                else:
                    response = "I found multiple movies. Which one are you talking about?\n"
                    for i in idxs:
                        response += '{}\n'.format(self.titles[i][0])
                    self.disambig = True
                    return response
        else:
            if len(matches) == 0:
                return self.generate_response_to_irrelevant_input()
            elif len(matches) > 1:
                return 'Please tell me about one movie at a time.'
            else:
                title, idxs = matches[0]
                sentiment = self.extract_sentiment(line.replace(title, ''))
                if sentiment == 0:
                    return "So did you like \"{}\" or hate it? Please tell me.".format(
                        self.titles[idxs[0]][0])
                else:
                    if len(idxs) > 1:
                        return "I found multiple matches for \"{}\". Can you be more specific? Maybe try telling me the year as well.".format(
                            title)
                    elif len(idxs) == 0:
                        return "Hmm, I couldn't find a match for \"{}\". Please tell me about some other movies you have watched!".format(
                            title)
                    else:
                        if sentiment > 0:
                            if idxs[0] in self.user_movie_set:
                                response = "(I think you already told me about that movie, but I'll update what you tell me!)\n"
                            else:
                                response = "Great! So you liked \"{}\". ".format(
                                    self.titles[idxs[0]][0])
                            self.process_movie(idxs[0], sentiment)
                        elif sentiment < 0:
                            if idxs[0] in self.user_movie_set:
                                response = "I think you already told me about that movie."
                            else:
                                response = "Okay, so you didn't like \"{}\". ".format(
                                    self.titles[idxs[0]][0])
                                self.process_movie(idxs[0], sentiment)
                        else:
                            return "I'm not sure if you liked or didn't the movie. Can you tell me a movie and what you thought about it?"

        # recommend once we have 5 movies
        if len(self.user_movies) >= 5:
            self.recommend_mode = True
            user_ratings = np.zeros(len(self.titles))
            for m in self.user_movies:
                user_ratings[m] = self.user_movies[m]
            self.recommendations = self.recommend(user_ratings,
                                                  self.bin_ratings,
                                                  k=10,
                                                  creative=self.creative)
            self.recommend_idx = 0
            return self.give_recommendation()
        else:
            response += " " + self.generate_request_for_more_movies()
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return response

    def give_recommendation(self):
        recommend_sentences = [
            "Why don't you check out \"{}\"? ",
            "I think you might enjoy \"{}\"! ",
            "\"{}\" might suit your tastes! "
        ]
        if self.recommend_idx < len(self.recommendations):
            response = ''
            if self.recommend_idx == 0:
                response += "Okay, based on what you told me, I think you would like \"{}\"! ".format(
                    self.titles[self.recommendations[self.recommend_idx]][0])
            else:
                response += random.choice(recommend_sentences).format(
                    self.titles[self.recommendations[self.recommend_idx]][0])
            response += 'Would you like another recommendation?'
            self.recommend_idx += 1
        else:
            response = "Sorry, I don't have any more recommendations!"

        return response

    def matches_question(self, text):
        '''
      Returns response to question
      '''
        question_responses = [
            "I don't know. Ask Google.", "I'd like to know as well.",
            "Let me think about that. I'll get back to you in a billion years."
        ]
        match = re.findall('(.*)\?', text, re.I)
        if match:
            return self.flip_question(text) + ' ' + random.choice(
                question_responses)
        else:
            return None

    def flip_question(self, text):
        '''
      Flips the perspective of the question
      '''
        table = {
            'I': 'you',
            'me': 'you',
            'my': 'your',
            'your': 'my',
            'myself': 'yourself',
            'yourself': 'myself'
        }
        # some common prepositions
        prep_set = {
            'of', 'with', 'at', 'from', 'including', 'until', 'against',
            'among', 'towards', 'upon', 'to'
        }
        words = re.split('\s|\?', text)
        words.pop()  # remove empty string at end
        last_word = None
        for i in range(len(words)):
            if words[i] in table:
                words[i] = table[words[i]]
            elif words[i] == 'you' or words[i] == 'You':
                if last_word in prep_set:
                    words[i] = 'me'
                else:
                    words[i] = 'I'
            last_word = words[i]
        return ' '.join(words) + '?'

    def generate_response_to_irrelevant_input(self):
        responses = [
            "I'm sorry, but I want to hear about a movie you liked.",
            "That's really cool and all, but can we go back to talking about movies? I want to know more about movies you enjoyed!",
            "Maybe we can talk about that later. Let's get back to talking about movies. Why don't you tell me what you thought about a movie you watched recently?"
        ]
        return random.choice(responses)

    def generate_request_for_more_movies(self):
        responses = [
            "Please tell me about more movies you've watched!",
            "Tell me another one of your favorite movies. This is so much fun!",
            "What is another movie you liked?"
        ]
        return random.choice(responses)

    def get_possible_matching_titles(self, line):
        possible_titles = self.extract_titles(line)
        matches = []
        if self.creative:
            self.prefix_match_found = False
            for title in possible_titles:
                movie_idxs = self.find_movies_by_title(title)
                #print(movie_idxs)
                if not self.prefix_match_found:
                    movie_idxs.extend(
                        self.find_movies_closest_to_title(title,
                                                          max_distance=3))
                    #print(movie_idxs)
                    movie_idxs = sorted(list(set(movie_idxs)))
                matches.append((title, movie_idxs))
        else:
            for title in possible_titles:
                matches.append((title, self.find_movies_by_title(title)))
        return matches

    def process_movie(self, movie_index, sentiment):
        self.user_movies[movie_index] = sentiment
        self.user_movie_set.add(movie_index)
        self.prev_idx = movie_index
        self.prev_sentiment = self.current_sentiment
        self.clear_current_movie()

    def clear_current_movie(self):
        self.current_sentiment = None
        self.current_title = None
        self.current_idxs = None

    def extract_titles(self, text):
        """Extract potential movie titles from a line of text.

      Given an input text, this method should return a list of movie titles
      that are potentially in the text.

      - If there are no movie titles in the text, return an empty list.
      - If there is exactly one movie title in the text, return a list
      containing just that one movie title.
      - If there are multiple movie titles in the text, return a list
      of all movie titles you've extracted from the text.

      Example:
        potential_titles = chatbot.extract_titles('I liked "The Notebook" a lot.')
        print(potential_titles) // prints ["The Notebook"]

      :param text: a user-supplied line of text that may contain movie titles
      :returns: list of movie titles that are potentially in the text
      """
        potential_titles = []
        if self.creative:
            pat1 = '"(.*?)"'
            stop_words = 'at|as|of|on|to|with|and|the|in|from|&|\+|by|or|de|vs\.'
            pat2 = '((?:[A-HJ-Z0-9]\S*(?:\s+(?:[A-Z0-9\.\-\(]\S*|' + stop_words + ')?|$)|I [A-Z0-9])(?:.*[A-HJ-Z0-9]\S*|.*[A-Z]\S+)?\s*(?:\(\d{4}\))?)'
            potential_titles = re.findall(pat1, text)
            potential_titles.extend(re.findall(pat2, text))
            potential_titles = list(set(potential_titles))
        else:
            potential_titles = re.findall('"(.*?)"', text)
        return potential_titles

    def find_movies_by_title(self, title):
        """ Given a movie title, return a list of indices of matching movies.

      - If no movies are found that match the given title, return an empty list.
      - If multiple movies are found that match the given title, return a list
      containing all of the indices of these matching movies.
      - If exactly one movie is found that matches the given title, return a list
      that contains the index of that matching movie.

      Example:
        ids = chatbot.find_movies_by_title('Titanic')
        print(ids) // prints [1359, 1953]

      :param title: a string containing a movie title
      :returns: a list of indices of matching movies
      """
        candidates = []
        if self.creative:
            movie = extract_titles_and_year(title)
            for i in range(len(self.movies)):
                match_found = False
                for dbt in self.movies[i].titles:
                    for qt in movie.titles:
                        # if database title starts with query title
                        if bool(re.match(qt + '($|\W)', dbt, re.I)):
                            match_found = True
                            break
                    if match_found:
                        break
                if match_found:
                    # if no year included in query, add all movies that match
                    if not movie.year:
                        candidates.append(i)
                        self.prefix_match_found = True
                    # if year included in query, add only movies that match both
                    # title AND year
                    if movie.year and movie.year == self.movies[i].year:
                        candidates.append(i)
                        self.prefix_match_found = True
        else:
            movie = extract_titles_and_year(title)
            for i in range(len(self.movies)):
                if set(movie.titles).intersection(set(self.movies[i].titles)):
                    if not movie.year:
                        candidates.append(i)
                    elif movie.year and movie.year == self.movies[i].year:
                        candidates.append(i)
                        return candidates
        return candidates

    class Example:
        """Represents a document with a label. klass is 'pos' or 'neg' by convention.
          words is a list of strings.
      """
        def __init__(self):
            self.klass = ''
            self.words = []

    def extract_sentiment(self, text):
        """Extract a sentiment rating from a line of text.

      You should return -1 if the sentiment of the text is negative, 0 if the
      sentiment of the text is neutral (no sentiment detected), or +1 if the
      sentiment of the text is positive.

      As an optional creative extension, return -2 if the sentiment of the text
      is super negative and +2 if the sentiment of the text is super positive.

      Example:
        sentiment = chatbot.extract_sentiment('I liked "The Titanic"')
        print(sentiment) // prints 1

      :param text: a user-supplied line of text
      :returns: a numerical value for the sentiment of the text
      """

        # Negation words and intensifiers; stemmed below so they match the stemmed sentiment lexicon.

        negationSet = {"n't", "never", "not", "no"}
        strongerSet = {
            "really", "very", "love", "hate", "terrible", "truly", "despise",
            "great", "fantastic", "amazing", "extremely", "horrible",
            "disgusting", "stunning", "adore"
        }
        punct = "\W+"

        newSet = set()
        for word in negationSet:
            newSet.add(self.p.stem(word))
        negationSet = newSet

        newSet = set()
        for word in strongerSet:
            newSet.add(self.p.stem(word))
        strongerSet = newSet

        textWords = nltk.word_tokenize(text)

        opp = False
        pos_num = 0
        neg_num = 0
        strength_val = 1
        num_sentiment_words = 0

        for word in textWords:
            word = self.p.stem(word)
            if word in negationSet:
                opp = True
                continue
            if re.match(punct, word):
                opp = False
                strength_val = 1
                continue
            if word in strongerSet:
                strength_val = 2
            if word in self.new_sentiment:
                if self.new_sentiment[word] == 'pos' and not opp:
                    pos_num += strength_val
                elif self.new_sentiment[word] == 'pos' and opp:
                    neg_num += strength_val
                elif self.new_sentiment[word] == 'neg' and not opp:
                    neg_num += strength_val
                else:
                    pos_num += strength_val
                num_sentiment_words += 1

        thresh = 0.25
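        # Bucket the average sentiment: avg > 1 -> +2, (thresh, 1] -> +1,
        # [-thresh, thresh] -> 0, [-1, -thresh) -> -1, avg < -1 -> -2
        # (collapsed to +/-1 when not in creative mode).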
        if num_sentiment_words == 0:
            sentiment = 0
        else:
            avg = (pos_num - neg_num) / float(num_sentiment_words)
            if avg > 1:
                sentiment = 2
            elif thresh < avg <= 1:
                sentiment = 1
            elif -thresh <= avg <= thresh:
                sentiment = 0
            elif -1 <= avg < -thresh:
                sentiment = -1
            else:
                sentiment = -2

            if not self.creative:
                if sentiment > 1:
                    sentiment = 1
                elif sentiment < -1:
                    sentiment = -1

        return sentiment

    def extract_sentiment_for_movies(self, text):
        """Creative Feature: Extracts the sentiments from a line of text
      that may contain multiple movies. Note that the sentiments toward
      the movies may be different.

      You should use the same sentiment values as extract_sentiment, described above.
      Hint: feel free to call previously defined functions to implement this.

      Example:
        sentiments = chatbot.extract_sentiment_for_text('I liked both "Titanic (1997)" and "Ex Machina".')
        print(sentiments) // prints [("Titanic (1997)", 1), ("Ex Machina", 1)]

      :param text: a user-supplied line of text
      :returns: a list of tuples, where the first item in the tuple is a movie title,
        and the second is the sentiment in the text toward that movie
      """
        pass

    def find_movies_closest_to_title(self, title, max_distance=3):
        """Creative Feature: Given a potentially misspelled movie title,
      return a list of the movies in the dataset whose titles have the least edit distance
      from the provided title, and with edit distance at most max_distance.

      - If no movies have titles within max_distance of the provided title, return an empty list.
      - Otherwise, if there's a movie closer in edit distance to the given title 
        than all other movies, return a 1-element list containing its index.
      - If there is a tie for closest movie, return a list with the indices of all movies
        tying for minimum edit distance to the given movie.

      Example:
        chatbot.find_movies_closest_to_title("Sleeping Beaty") # should return [1656]

      :param title: a potentially misspelled title
      :param max_distance: the maximum edit distance to search for
      :returns: a list of movie indices with titles closest to the given title and within edit distance max_distance
      """
        candidates = []
        movie = extract_titles_and_year(title)
        for i in range(len(self.movies)):
            match_found = False
            for dbt in self.movies[i].titles:
                for qt in movie.titles:
                    dist = edit_distance(qt, dbt)
                    if dist <= max_distance:
                        match_found = True
                        # if distance is smaller than all previous, discard previous
                        if dist < max_distance:
                            candidates = []
                            max_distance = dist
                        break
                if match_found:
                    break
            if match_found:
                if not movie.year:
                    candidates.append(i)
                if movie.year and movie.year == self.movies[i].year:
                    candidates.append(i)
                    return candidates
        return candidates

    def disambiguate(self, clarification, candidates):
        """Creative Feature: Given a list of movies that the user could be talking about 
      (represented as indices), and a string given by the user as clarification 
      (eg. in response to your bot saying "Which movie did you mean: Titanic (1953) 
      or Titanic (1997)?"), use the clarification to narrow down the list and return 
      a smaller list of candidates (hopefully just 1!)

      - If the clarification uniquely identifies one of the movies, this should return a 1-element
      list with the index of that movie.
      - If it's unclear which movie the user means by the clarification, it should return a list
      with the indices it could be referring to (to continue the disambiguation dialogue).

      Example:
        chatbot.disambiguate("1997", [1359, 2716]) should return [1359]
      
      :param clarification: user input intended to disambiguate between the given movies
      :param candidates: a list of movie indices
      :returns: a list of indices corresponding to the movies identified by the clarification
      """
        filtered_idxs = []
        for idx in candidates:
            if bool(
                    re.search('(\W|^)' + clarification + '(\W|$)',
                              self.titles[idx][0], re.I)):
                filtered_idxs.append(idx)
        # try looking for phrases like 'first one' or '2nd movie'
        if not filtered_idxs:
            ordinals = ['first|1st', 'second|2nd', 'third|3rd', 'fourth|4th',
                        'fifth|5th', 'sixth|6th', 'seventh|7th', 'eighth|8th',
                        'ninth|9th', 'tenth|10th']
            for pos, pattern in enumerate(ordinals):
                if pos < len(candidates) and \
                   re.search(r'(\W|^)(' + pattern + r')(\W|$)', clarification, re.I):
                    filtered_idxs = [candidates[pos]]
                    break

        if not filtered_idxs:
            return candidates
        else:
            return filtered_idxs

    #############################################################################
    # 3. Movie Recommendation helper functions                                  #
    #############################################################################

    def binarize(self, ratings, threshold=2.5, creative=False):
        """Return a binarized version of the given matrix.

      To binarize a matrix, replace all entries above the threshold with 1.
      and replace all entries at or below the threshold with a -1.

      Entries whose values are 0 represent null values and should remain at 0.

      :param ratings: a (num_movies x num_users) matrix of user ratings, from 0.5 to 5.0
      :param threshold: Numerical rating above which ratings are considered positive

      :returns: a binarized version of the movie-rating matrix
      """
        #############################################################################
        # TODO: Binarize the supplied ratings matrix.                               #
        #############################################################################

        # Map ratings above the threshold to +1 and nonzero ratings at or below
        # it to -1 (creative mode adds a +/-2 band); zero entries stay 0.
        if creative:
            high_thresh = 4
            low_thresh = 5 - high_thresh
            binarized_ratings = np.where(
                ratings >= high_thresh, 2.0, 0.0) + np.where(
                    (ratings > threshold) &
                    (ratings < high_thresh), 1.0, 0.0) + np.where(
                        (ratings <= threshold) &
                        (ratings > low_thresh), -1.0, 0.0) + np.where(
                            (ratings != 0.0) &
                            (ratings <= low_thresh), -2.0, 0.0)
        else:
            binarized_ratings = np.where(
                ratings > threshold, 1.0, 0.0) + np.where(
                    (ratings != 0.0) & (ratings <= threshold), -1.0, 0.0)
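        # Illustrative example (starter mode, threshold=2.5):
        #   np.array([[0.0, 1.0, 3.0],
        #             [5.0, 2.5, 0.0]])  binarizes to  [[ 0., -1.,  1.],
        #                                               [ 1., -1.,  0.]]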

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return binarized_ratings

    def similarity(self, u, v):
        """Calculate the cosine similarity between two vectors.

      You may assume that the two arguments have the same shape.

      :param u: one vector, as a 1D numpy array
      :param v: another vector, as a 1D numpy array

      :returns: the cosine similarity between the two vectors
      """
        #############################################################################
        # TODO: Compute cosine similarity between the two vectors.
        #############################################################################
        u_norm = np.linalg.norm(u)
        v_norm = np.linalg.norm(v)
        dot_prod = np.dot(u, v)
        similarity = dot_prod
        if u_norm == 0.0 or v_norm == 0.0:
            return 0.0
        else:
            similarity = float(dot_prod) / (u_norm * v_norm)
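        # e.g. u = [1, 0, 1], v = [1, 1, 0]: dot product 1, both norms sqrt(2),
        # so the similarity is 0.5; orthogonal vectors give 0, identical ones 1.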
        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return similarity

    def recommend(self, user_ratings, ratings_matrix, k=10, creative=False):
        """Generate a list of indices of movies to recommend using collaborative filtering.

      You should return a collection of `k` indices of movies recommendations.

      As a precondition, user_ratings and ratings_matrix are both binarized.

      Remember to exclude movies the user has already rated!

      :param user_ratings: a binarized 1D numpy array of the user's movie ratings
      :param ratings_matrix: a binarized 2D numpy matrix of all ratings, where
        `ratings_matrix[i, j]` is the rating for movie i by user j
      :param k: the number of recommendations to generate
      :param creative: whether the chatbot is in creative mode

      :returns: a list of k movie indices corresponding to movies in ratings_matrix,
        in descending order of recommendation
      """

        #######################################################################################
        # TODO: Implement a recommendation function that takes a vector user_ratings          #
        # and matrix ratings_matrix and outputs a list of movies recommended by the chatbot.  #
        #                                                                                     #
        # For starter mode, you should use item-item collaborative filtering                  #
        # with cosine similarity, no mean-centering, and no normalization of scores.          #
        #######################################################################################

        # Populate this list with k movie indices to recommend to the user.

        unseen_movies = np.where(user_ratings == 0)[0]
        seen_movies = np.where(user_ratings != 0)[0]

        ratings_unseen = []

        for i in unseen_movies:
            unseen_ratings = ratings_matrix[i, :]
            weights = []
            ratings = []
            for j in seen_movies:
                seen_ratings = ratings_matrix[j, :]
                weight = self.similarity(unseen_ratings, seen_ratings)
                weights.append(weight)
                ratings.append(user_ratings[j])
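            # Item-item collaborative filtering: score the unseen movie i by the
            # similarity-weighted sum of the user's ratings on seen movies.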
            estimated_rating = float(np.dot(weights, ratings))
            ratings_unseen.append([i, estimated_rating])

        ratings_unseen.sort(key=lambda x: x[1], reverse=True)

        recommendations = []

        for i in range(k):
            recommendations.append(ratings_unseen[i][0])

        #############################################################################
        #                             END OF YOUR CODE                              #
        #############################################################################
        return recommendations

    def checkAnger(self, string):
        words = string.split()
        upperCase = bool(words) and all(word.isupper() for word in words)

        upperCaseResponses = [
            "Any reason you are yelling at me?!",
            "Is your caps lock key stuck or something?",
            "It looks like you were busy capslocking >_>."
        ]
        if upperCase:
            return random.choice(upperCaseResponses)
        else:
            return ''

    def checkSwearWords(self, string):
        swearSet = {
            "f**k", "f*****g", "shit", "damn", "bitch", "crap", "piss", "dick",
            "c**k", "pussy", "asshole", "f*g", "bastard", "s**t", "douche",
            "bollocks", "arsehole", "bloody"
        }
        words = set(string.lower().split())
        if words & swearSet:
            return 'Wash your mouth with soap!'
        else:
            return ''

    #############################################################################
    # 4. Debug info                                                             #
    #############################################################################

    def debug(self, line):
        """Return debug information as a string for the line string from the REPL"""
        # Pass the debug information that you may think is important for your
        # evaluators
        debug_info = 'debug info'
        return debug_info

    #############################################################################
    # 5. Write a description for your chatbot here!                             #
    #############################################################################
    def intro(self):
        """Return a string to use as your chatbot's description for the user.

      Consider adding to this description any information about what your chatbot
      can do and how the user can interact with it.
      """
        return """
Example #58
class Chatbot:
    """Simple class to implement the chatbot for PA 6."""
    def __init__(self, is_turbo=False):
        self.name = 'moviebot'
        self.is_turbo = is_turbo
        self.p = PorterStemmer()
        self.read_data()
        #   self.titles, self.ratings = ratings()
        self.binarize()
        self.RecommendationStrings = [
            "I think you should check out %s! ",
            "This movie will blow your mind: %s. ",
            "Watch %s. It will ruin all other movies for you. "
        ]

        self.ratedMovieList = {}
        self.userRatingVector = np.zeros(len(self.titles))
        self.recommendedMovies = []

        self.inTheMiddleOfSentimentAnalysis = False
        self.currentMovieForMoreInformation = ""

        self.TwoMoviesBoolean = False
        self.currentConjunction = ""
        self.sentimentOfPreviousMovie = 0
        self.check = {}
        self.distanceThreshold = 10
        self.confirm = False
        self.previousInput = ""

    def greeting(self):
        """chatbot greeting message"""

        HelloStrings = [
            "How can I help you?",
            "Hey there! It's so nice to meet you. I'd love to hear what you thought of a few movies!",
            "What's up? Tell me about some movies you've seen!"
        ]

        greeting_message = random.choice(HelloStrings)

        return greeting_message

    def goodbye(self):
        """chatbot goodbye message"""
        GoodbyeStrings = [
            "Have a nice day!", "I'm going to miss you.",
            "Am gonna be in my room crying until I see you again."
        ]

        goodbye_message = random.choice(GoodbyeStrings)

        return goodbye_message

    def process(self, input):
        """Takes the input string from the REPL and calls delegated functions
        that
          1) extract the relevant information and
          2) transform the information into a response to the user
        """
        self.TwoMoviesBoolean = False
        self.sentimentOfPreviousMovie = 0

        WrongFormatStrings = [
            "I'm sorry, is that the right format? Please make sure to include the name of the movie in quotation marks.",
            "Whoaaa, can you please make sure you use quotation marks?",
            "Quotation marks around the movie, buddy. Please and thank you."
        ]
        UnknownMovieStrings = [
            "I'm sorry, I've never heard about that movie! Please tell me about another one.",
            "Is that some random indie film? Never heard of it!",
            "Man, I really need to get back to the cinema. Never heard of that movie..."
        ]
        SameMovieStrings = [
            "Hey! You already told me about that movie. Tell me about a different one now.",
            "Come on man, pick a NEW movie!",
            "Have you only watched 1 movie in your entire life? Pick a new one, please"
        ]
        ConfirmationStrings = [
            "I think you were talking about %s. Am I right?",
            "It probably wouldn't hurt for you to brush up on your spelling a bit. Did you mean %s?",
            "C'mon now, you can spell better than that! Were you talking about %s?"
        ]
        ConfusedStrings = [
            "Hmmmm. Didn't quite get that one. Let's try again. Tell me about another movie!",
            "Well this is going nowhere fast. From the top, lets try a new one!",
            "Trying to keep me on my toes I see. How about we get back to some recommendations. Tell me about a movie you've seen!"
        ]
        WhatIsStrings = [
            "To be honest, I'm not sure I'd like to talk about %s. How about we get back to movies?",
            "As much as I'd love to talk about %s, I'm really here for the movies. Give me another one!",
            "Would you rather chat about %s or get some movie recommendations? That what I thought. Hit me with a movie!"
        ]
        CanYouStrings = [
            "I'm not big on talking about me. Lets focus on the movies.",
            "Can you?!? Back to the movies please.",
            "I really appreciate how you've taken an interest in learning about me but all I really want to talk about is what you think about movies. How about one more?"
        ]
        ArbitraryStrings = [
            "Ok, got it.",
            "Interesting. But not as interesting as movies. Let's get back to movie recommendations!",
            "Wow you have such a broad range of interesting topics for discussion. I'd really like to stick to movies though.",
            "Hmmmm very interesting. How about you let me know what you thought of another movie?"
        ]

        if len(input) == 0:
            return "It seems you meant to say something but forgot"

        if len(self.recommendedMovies) > 0:
            movieRec = self.recommend(self.userRatingVector).title()
            response = random.choice(
                self.RecommendationStrings
            ) % movieRec + " Enter any key to hear another recommendation. (Or enter :quit if you're done.)"
            return response

        if self.inTheMiddleOfSentimentAnalysis:
            self.inTheMiddleOfSentimentAnalysis = False
            response = self.addRating(self.currentMovieForMoreInformation,
                                      input)
            return response

        if self.confirm:
            self.confirm = False
            match = re.match("yep|yea|yes|y *$|Yep|Yea|Yes|Y *$", input)
            if match is None:
                return random.choice(ConfusedStrings)
            else:
                if self.currentMovieForMoreInformation in self.ratedMovieList:
                    return random.choice(SameMovieStrings)
                return self.addRating(self.currentMovieForMoreInformation,
                                      self.previousInput)

        # Explaining this regex: it optionally matches a leading article, then the
        # title, then an optional parenthesized year, and repeats that whole quoted
        # pattern twice around a conjunction.
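        # e.g. the illustrative input 'I loved "The Matrix" but I hated "Titanic"'
        # would put the two titles in groups 3 and 9 and the conjunction "but" in
        # group 6.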

        matchDouble = re.match(
            '(.*)\"(The|A|An|El|La)? *([\w ]*)( \(.*\)*)*\"(.*) (and|or|but|yet|neither|either|so)\,* (.*)\"(The|A|An|El|La)? *([\w ]*)( \(.*\)*)*\"(.*)',
            input)
        if matchDouble is not None:
            if matchDouble.group(2):
                movie1Name = matchDouble.group(3) + ", " + matchDouble.group(2)
            else:
                movie1Name = matchDouble.group(3)

            if matchDouble.group(8):
                movie2Name = matchDouble.group(9) + ", " + matchDouble.group(8)
            else:
                movie2Name = matchDouble.group(9)

            movie1Name = movie1Name.lower()
            movie2Name = movie2Name.lower()

            if (movie1Name not in self.ratedMovieList) and (
                    movie2Name not in self.ratedMovieList):
                if (movie1Name in self.titlesOnly) and (movie2Name
                                                        in self.titlesOnly):

                    self.currentConjunction = matchDouble.group(6)
                    self.TwoMoviesBoolean = True

                    input1 = matchDouble.group(1) + " " + matchDouble.group(5)
                    input2 = matchDouble.group(7) + " " + matchDouble.group(11)

                    response1 = self.addRating(movie1Name, input1)
                    response2 = self.addRating(movie2Name, input2)

                    return (response1 + "\n" + response2)
                else:
                    response = random.choice(UnknownMovieStrings)
                    return response
            else:
                response = random.choice(SameMovieStrings)
                return response

        match = re.match('.*\"(The|A|An|El|La)? *([\w ]*)( \(.*\)*)*\".*',
                         input)
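        # e.g. for the illustrative input 'I enjoyed "The Godfather (1972)" a lot',
        # group 1 is the article, group 2 the title, and group 3 the optional year.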
        if match is None:
            match = re.match('(?:I )?[^A-Z]*([A-Z].*)', input)
            if match is not None:
                matchSubstr = match.group(1).lower()
                splitSubStr = matchSubstr.split()
                movieName = ""
                for i in range(0, len(splitSubStr)):
                    movieName = movieName + " " + splitSubStr[i]
                    movieName = movieName.strip()
                    if movieName in self.titlesOnly:
                        input = self.removeTitle(movieName, input)
                        return self.addRating(movieName, input)
            if self.is_turbo:
                can_you = re.match("[Cc]an you (.*)", input)
                what_is = re.match("[Ww]hat is (.*?)[\?.!]?$", input)
                if can_you is not None:
                    return random.choice(CanYouStrings)
                if what_is is not None:
                    return random.choice(WhatIsStrings) % what_is.group(1)
                else:
                    return random.choice(ArbitraryStrings)
            return random.choice(WrongFormatStrings)

        if match is not None:
            if match.group(1):
                movieName = match.group(2) + ", " + match.group(1)
            else:
                movieName = match.group(2)
            movieName = movieName.lower()
            if movieName not in self.ratedMovieList:

                if movieName in self.titlesOnly:
                    input = self.removeTitle(movieName, input)
                    return self.addRating(movieName, input)
                else:
                    movieName = self.findPotentialMovie(movieName)
                    if movieName is None:
                        return random.choice(UnknownMovieStrings)
                    else:
                        self.currentMovieForMoreInformation = movieName
                        self.confirm = True
                        self.previousInput = input
                        return random.choice(
                            ConfirmationStrings) % movieName.title()

            else:
                response = random.choice(SameMovieStrings)
        else:
            response = random.choice(WrongFormatStrings)

        return response

    def addRating(self, movieName, string):
        rating = 0
        MoreMoviesStrings = [
            "Thank you! Please tell me about another movie.",
            "Whooo making progress. Give me another one.",
            "Just a few more movies and I will blow your mind with a recommendation. Give me one more."
        ]
        NegationWords = [
            "didn't", "never", "not", "don't", "none", "nobody"
        ]

        strongPositive = [
            "love", "adore", "favorite", "amazing", "incredible", "fantastic"
        ]
        strongNegative = ["awful", "terrible", "hate"]
        strongIntensifiers = ["really", "very", "extremely"]
        confirmingConjunctionList = ["and", "or", "neither", "either", "so"]
        opposingConjunctionList = ["but", "yet"]

        strongPositiveBoolean = False
        strongNegativeBoolean = False
        strongIntensifierBoolean = False

        ReverseBoolean = 1
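        # ReverseBoolean flips the contribution of the next sentiment word after a
        # negation word, e.g. a positive lexicon word preceded by "didn't" counts
        # as -1 instead of +1.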

        for word in string.split():
            if word in NegationWords:
                ReverseBoolean = -1
            if word in strongPositive:
                strongPositiveBoolean = True
            if word in strongNegative:
                strongNegativeBoolean = True
            if word in strongIntensifiers:
                strongIntensifierBoolean = True

            if self.p.stem(word) in self.sentiment:
                if self.sentiment[self.p.stem(word)] == "pos":
                    rating += (1 * ReverseBoolean)

                    if strongIntensifierBoolean:
                        strongPositiveBoolean = True
                        strongIntensifierBoolean = False
                else:
                    rating -= (1 * ReverseBoolean)

                    if strongIntensifierBoolean:
                        strongNegativeBoolean = True
                        strongIntensifierBoolean = False
                ReverseBoolean = 1

        if rating >= 1:
            rating = 1
            strongNegativeBoolean = False
        elif rating < 0:
            rating = -1
            strongPositiveBoolean = False

        if self.TwoMoviesBoolean and self.sentimentOfPreviousMovie == 0:
            self.sentimentOfPreviousMovie = rating

        if rating == 0:
            if self.TwoMoviesBoolean:
                if self.currentConjunction in confirmingConjunctionList:
                    rating = self.sentimentOfPreviousMovie
                elif self.currentConjunction in opposingConjunctionList:
                    rating = -1 * self.sentimentOfPreviousMovie
            else:
                self.inTheMiddleOfSentimentAnalysis = True
                self.currentMovieForMoreInformation = movieName
                response = movieName.title() + "! I didn't understand if you liked it or not. Tell me more."
                return response

        self.ratedMovieList[movieName] = rating
        self.userRatingVector[self.titlesOnly.index(movieName)] = rating

        if len(self.ratedMovieList) >= 5:
            movieRec = self.recommend(self.userRatingVector).title()
            response = random.choice(
                self.RecommendationStrings
            ) % movieRec + " Tap any key to hear another recommendation. (Or enter :quit if you're done.)"
        else:
            if strongPositiveBoolean and not strongNegativeBoolean:
                response = "Whoa, you really liked that one, huh? Give me another one. "
            elif strongNegativeBoolean and not strongPositiveBoolean:
                response = "Wow, that bad, huh? Give me another one. "
            else:
                response = random.choice(MoreMoviesStrings)

        return response

    def removeTitle(self, movieName, input):
        movieSplit = movieName.split()
        inputSplit = input.lower().split()
        for word in movieSplit:  #remove the movie title from the words
            if word in inputSplit: inputSplit.remove(word)
        input = " ".join(inputSplit)
        return input

    def minimumEditDistance(self, string, userInput, knownMovie):
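        # Recursive Levenshtein-style edit distance with insertion/deletion cost 1
        # and substitution cost 2 (see substitution below); the first argument is
        # only a branch label, and self.check memoizes results keyed on the pair of
        # substrings.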
        userLen = len(userInput)
        movieLen = len(knownMovie)
        concat = "%s %s" % (userInput, knownMovie)
        if userInput == knownMovie:
            return 0
        if userLen == 0:
            return movieLen
        elif movieLen == 0:
            return userLen
        if concat in self.check:
            return self.check[concat]
        else:
            x = min(
                self.minimumEditDistance("first", userInput[:-1], knownMovie) +
                1,
                self.minimumEditDistance("second", knownMovie[:movieLen - 1],
                                         userInput) + 1,
                self.minimumEditDistance("third", userInput[:userLen - 1],
                                         knownMovie[:movieLen - 1]) +
                self.substitution(knownMovie[movieLen - 1],
                                  userInput[userLen - 1]))
            self.check[concat] = x
            return x

    def substitution(self, letterOne, letterTwo):
        if letterOne == letterTwo:
            return 0
        else:
            return 2

    def findPotentialMovie(self, string):
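        # Spell-check helper: scans every known title, skips candidates whose length
        # or character set differs too much, and keeps the closest title within
        # self.distanceThreshold edits (a threshold assumed to be set in __init__).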
        minDist = None
        potentialMovie = None
        for i, title in enumerate(self.titlesOnly):
            if math.fabs(len(string) - len(title)) < 4:
                strSet = set(string)
                titleSet = set(title)
                if len(strSet - titleSet) > 3 or len(titleSet - strSet) > 2:
                    continue
                self.check = {}
                dist = self.minimumEditDistance("zero", string, title)
                if (minDist is None
                        or dist < minDist) and dist < self.distanceThreshold:
                    minDist = dist
                    potentialMovie = title
        return potentialMovie

    #############################################################################
    # 3. Movie Recommendation helper functions                                  #
    #############################################################################

    def read_data(self):
        """Reads the ratings matrix from file"""
        # This matrix has the following shape: num_movies x num_users
        # The values stored in each row i and column j is the rating for
        # movie i by user j
        self.titles, self.ratings = ratings()
        reader = csv.reader(open('data/sentiment.txt', 'rb'))
        self.sentiment = dict(reader)

        self.titlesOnly = []

        for entry in self.titles:
            titleOnly = entry[0].split(' (')[0]
            self.titlesOnly.append(titleOnly.lower())
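        # Also index the sentiment lexicon by stemmed keys so the self.p.stem(word)
        # lookups in addRating hit the lexicon for inflected words.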
        self.sentiment.update(
            {self.p.stem(k): v
             for k, v in self.sentiment.items()})

    def binarize(self):
        """Modifies the ratings matrix to make all of the ratings binary"""
        for movie_id, movie in enumerate(self.ratings):
            for user_id, rating in enumerate(movie):
                if rating != 0:
                    self.ratings[movie_id, user_id] = 1 if rating > 2.5 else -1

    def distance(self, u, v):
        """Calculates a given distance function between vectors u and v"""
        numerator = np.dot(u, v)
        denominator = np.linalg.norm(u) * np.linalg.norm(v)
        similarity = numerator / (denominator + 1e-7)
        return similarity

    def recommend(self, u):
        """Generates a list of movies based on the input vector u using
      collaborative filtering"""

        sims = {}  #similarities
        recommendation = ""
        topScore = None
        start = time.time()
        for movie_id, rating in enumerate(u):
            if rating != 0:
                sims[movie_id] = {}
                for r_id, movie in enumerate(self.ratings):
                    sims[movie_id][r_id] = self.distance(
                        movie, self.ratings[movie_id])
    #   print time.time() - start, "distance time"

        start = time.time()
        for i, movieRating in enumerate(self.ratings):
            iPrediction = 0
            for movieName in self.ratedMovieList:
                j = self.titlesOnly.index(movieName)
                iPrediction += sims[j][i] * 1.0 * self.userRatingVector[j]
            if topScore is None or iPrediction > topScore:
                movie = self.titlesOnly[i]
                if movie not in self.ratedMovieList and movie not in self.recommendedMovies:
                    #   print("prediction score for %s is %.5f" % (movie, iPrediction))
                    topScore = iPrediction
                    recommendation = movie

    #   print time.time() - start, "recommendation time"
        self.recommendedMovies.append(recommendation)

        articlePattern = re.match('(.*), (the|a|an|el|la)', recommendation)
        if articlePattern is not None:
            recommendation = articlePattern.group(
                2) + " " + articlePattern.group(1)

        return recommendation

    def debug(self, input):
        """Returns debug information as a string for the input string from the REPL"""
        # Pass the debug information that you may think is important for your
        # evaluators
        debug_info = 'debug info'
        return debug_info

    #############################################################################
    # 5. Write a description for your chatbot here!                             #
    #############################################################################
    def intro(self):
        return """
      Welcome to our MovieBot! A couple of things to help you out in the processs of using
      it:
        - The only difference between Starter and Creative Modes is Creative mode supports
          Arbitrary Input responses. All other features are supported in Start mode!
        - We implemented the following Creative Mode features:
            - Identifying movies without quotation marks or perfect capitalization
            - Fine-grained sentiment extraction
            - Spell-checking movie titles
            - Extracting sentiment with multiple-movie input
            - Responding to arbitrary input
      Enjoy!
      """

    #############################################################################
    # Auxiliary methods for the chatbot.                                        #
    #                                                                           #
    # DO NOT CHANGE THE CODE BELOW!                                             #
    #                                                                           #
    #############################################################################

    def bot_name(self):
        return self.name