Example #1
    def __init__(self, fileA, fileB):
        self.__allWords = set()
        self.__wordsA = dict()
        self.__wordsB = dict()

        # one stemmer instance is enough; reuse it for both files
        stemmer = PorterStemmer()

        with open(fileA, 'r') as document:
            for line in document:
                for word in line.strip().split():
                    word = stemmer.stem(word, 0, len(word) - 1)
                    if word in self.__wordsA:
                        self.__wordsA[word] += 1
                    else:
                        self.__wordsA[word] = 1

        with open(fileB, 'r') as document:
            for line in document:
                for word in line.strip().split():
                    word = stemmer.stem(word, 0, len(word) - 1)
                    if word in self.__wordsB:
                        self.__wordsB[word] += 1
                    else:
                        self.__wordsB[word] = 1

        self.__allWords = set(self.__wordsA.keys()) | set(self.__wordsB.keys())
        self.__table = {word: index for index, word in enumerate(self.__allWords)}
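The resulting __table maps every word seen in either file to a unique integer index. A tiny standalone illustration of the same construction, with hypothetical data and outside the class:

# Hypothetical illustration of the vocabulary table built above.
all_words = {"run", "search", "cat"}
table = {word: index for index, word in enumerate(all_words)}
# e.g. {'cat': 0, 'run': 1, 'search': 2} -- exact order depends on set iteration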
Example #2
 def __init__(self,name,task_queue,result_queue):
     self.name = name
     self.r_stemmer = PorterStemmer()
     self.queue = task_queue
     self.queue2 = result_queue
     # load the stopword list
     pre_worker.load_stopwords()
Example #3
def stemizeQuery(query):
	# STEMIZER FOR QUERY
	porter = PorterStemmer()
	newList = []
	for q in query:
		newList.append(porter.stem(q,0,len(q)-1))
	return newList
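These snippets follow the calling convention of Martin Porter's reference Python implementation (https://tartarus.org/martin/PorterStemmer/), where stem(p, i, j) stems p[i..j] inclusive and returns the stemmed string, so stem(word, 0, len(word) - 1) stems the whole word. A minimal, hypothetical usage sketch of stemizeQuery under that assumption:

# Hypothetical usage; the expected stems assume a standard Porter stemmer.
query_tokens = ["running", "searches", "quickly"]
print(stemizeQuery(query_tokens))  # roughly ['run', 'search', 'quickli']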
Example #4
    def __init__(self, path):
        self.stemmer = PorterStemmer()
        self.path = path

        self.cur_idx = 0
        self.batch = 2
        self.sample = 0.001
        self.vocab_size = 10000

        self.total_count = 0
        self.word_count = Counter()
        self.word2idx = defaultdict(int)
        self.idx2word = {}
        self.word_sample = {}

        self.batch_size = 128
        self.embedding_size = 128  # Dimension of the embedding vector.
        self.skip_window = 3  # How many words to consider left and right.
        self.raw_sample_probs = [0.5, 0.3, 0.2]
        self.sample_probs = []
        cumulative = 0  # avoid shadowing the built-in sum()
        for prob in self.raw_sample_probs:
            cumulative += prob
            self.sample_probs.append(cumulative)
        self.num_skips = 2  # How many times to reuse an input to generate a label.

        self.valid_size = 16  # Random set of words to evaluate similarity on.
        self.valid_window = 100  # Only pick dev samples in the head of the distribution.
        #self.valid_examples = np.random.choice(valid_window, valid_size, replace=False)
        self.num_negative_sampled = 64  # Number of negative examples to sample.

        self.batch_index = 0
        self.get_stat()
Example #5
def stemize(dictList):
	# STEMIZER FOR DICTIONARY
	porter = PorterStemmer()
	for tokenDict in dictList:
		# copy the keys first so the dict can be modified while iterating
		for token in list(tokenDict.keys()):
			tokenDict[porter.stem(token, 0, len(token) - 1)] = tokenDict.pop(token)
	return dictList
Example #6
def tokenize(sText, pairing=False):
    '''Given a string of text sText, returns a list of the individual stemmed tokens that
        occur in that string (in order). This is my quick and dirty Tokenizer.
        Satisfaction Not Guaranteed'''
    import string
    from stemmer import PorterStemmer
    sText = sText.lower()
    sText = re.sub("’", "'", sText)
    sText = re.sub("&.{0,6};", " ", sText)
    sText = re.sub("[\x80-\xff]", "", sText)
    sText = sText.split(None)
    for p in string.punctuation.replace("'", ""):
        try:
            sText = mapAndFold(lambda x: x.split(p), sText)
        except TypeError:  # empty string
            return []
    sText = mapAndFold(lambda x: x.split(), sText)
    sText = map(lambda x: x.strip("\'"), sText)
    sText = map(lambda x: x.strip("\""), sText)
    sText = map(lambda x: x.strip("_"), sText)
    sText = filter(lambda x: not re.match("\d+", x), sText)
    sText = filter(lambda x: not x == "", sText)
    sText = filter(lambda x: not x[0] == "@", sText)
    stemmer = PorterStemmer()
    if pairing:
        #return original with token val in tuple
        return [(w, stemmer.stem(w, 0, len(w) - 1)) for w in sText]
    return [stemmer.stem(w, 0, len(w) - 1) for w in sText]
Example #7
def GetDataset():
	emails = None
	x_vals = []
	y_vals = []
	stemmer = PorterStemmer()

	with open("pickled_reduced_chains.txt", "rb") as fp1:   # Unpickling
		emails = pickle.load(fp1)
	i = 0
	text_data = []
	for i in range(0, len(emails)):
		print "Evaluation Email %d" % (i)
		email, next_email, time_diff = emails[i]
		print emails[i]
		# Create feature array
		features = []
		# if np.round(time_diff / 60) > 72: continue;
		#Feature 1: Number of to
		features.append(len(email['to']))

		# Feature 2: Num words
		words = email['body'].split()
		features.append(len(words))

		# Feature 3: Number of CC
		features.append(email['cc_count'])

		# Feature 4: is reply
		if email['is_re']:
			features.append(1)
		else:
			features.append(0)

		# Feature 5: Time of Day (minutes)
		date = email['date']['local_date']
		hour = date.hour
		features.append(hour)

		# Feature 6: Length of Subject Line
		subject_words = email['subject'].split()
		features.append(len(subject_words))

		# Feature 7: Day of Week
		features.append(date.weekday())

		# Feature 8: Question marks in Body
		features.append(email['body'].count('?'))

		# Feature 9: Question marks in Subject
		features.append(email['subject'].count('?'))

		x_vals.append(features)

		# Append y_value for training point
		y_vals.append(int(np.round(time_diff / 60)))

	a = np.array(x_vals)
	b = np.array(y_vals)
	return a, b
Example #8
def stemWords(listTokens):
	s = PorterStemmer()
	stemmedTerms = []

	for x in listTokens:
		stemmedTerms.append(s.stem(x, 0, len(x) - 1))

	return stemmedTerms
Example #9
def stemWords(inList):
    outList = []
    ps = PorterStemmer()

    for token in inList:
        stemmed_token = ps.stem(token, 0, len(token) - 1)
        outList.append(stemmed_token)

    return outList
Example #10
def stemWords(tokens):
    """Function that stems the words. """
    # use porter stemmer
    #  https://tartarus.org/martin/PorterStemmer/python.txt

    p = PorterStemmer()
    for index, word in enumerate(tokens):
        tokens[index] = p.stem(word, 0, len(word) - 1)

    return tokens
Example #11
def stem(word):
    # word needs to be all lowercase before being passed to stem
    word = word.lower()

    # remove punctuation such as .,?!"
    mymatch = re.compile(r'(\,|\.|\!|\?|\")')
    word = mymatch.sub(r'', word)

    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)

    return word
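A short, hypothetical usage sketch of the helper above; it assumes re and PorterStemmer are imported at module level, as in the surrounding examples:

# Hypothetical usage; the expected stems assume a standard Porter stemmer.
for raw in ['Caresses,', 'ponies!', 'Running?']:
    print(stem(raw))  # roughly: caress, poni, run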
Example #12
def main():
    array = []
    array2 = []
    p = PorterStemmer()
    with open(sys.argv[1]) as f:
        for line in f:
            if len(line) > 1:
                array.append(line[0:len(line) - 1])
        word_dictionary, tag_dictionary, count = read_input(array, p)

    with open(sys.argv[2]) as f:
        for line in f:
            if len(line) > 1:
                array2.append(line[0:len(line) - 1])
    read_test_data(array2, word_dictionary, tag_dictionary, count)
Example #13
def GetWordDictionary(emails):
    word_dict = {}
    count = 0
    stemmer = PorterStemmer()
    for email_case in emails:
        email = email_case[0]
        body = SplitText(email['body'])
        for word in body:
            modified_word = word
            if len(modified_word) > 1:
                modified_word = stemmer.stem(word, 0, len(word) - 1)

            if modified_word not in word_dict:
                word_dict[modified_word] = count
                count += 1
    return word_dict
Example #14
def main():
    # Reading the document from the file
    fileName = "cran.all.1400"

    documents = readFromFile(fileName, "r")

    # Reading stop words from the file
    stopwordsList = readFromFile("stopwords.txt", "r")
    stopwords = stopwordsList.split()

    # For each document, keep its id number, the number of unique terms it contains, and each term with its term frequency.
    docId = 1

    # InvFileHash
    invFileHash = {}

    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]
    totalDocument = len(document)
    print "Total documents:", totalDocument
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)

        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word.lower(), 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)

        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)

    # Writes to the Inverted File Hash file
    writeToFile(invFileHash)

    # To read the queries list from the cran query file
    queryFileRead = readFromFile("cran.qry", "r")

    # Calculate the Vector Space Model (total number of documents, stopwords list)
    vectorSpaceModel(totalDocument, queryFileRead, stopwords)
Example #15
def classify(folds, nb_or_svm, ngrams, stemming, binary):
    p = PorterStemmer()

    vectorizer = CountVectorizer(input="filename", \
                                 ngram_range=ngrams, \
                                 tokenizer=(lambda d: [(p.stem(t, 0, len(t)-1) if stemming else t) for t in d.split()]), \
                                 binary=binary, \
                                 min_df=4, max_df=1.0)

    X = vectorizer.fit_transform([f[0] for fold in folds for f in fold])

    accuracies = []
    for i in range(len(folds)):
        classifier = SVC(gamma="auto", kernel="linear") if nb_or_svm[0] == "svm" \
                else MultinomialNB(alpha=(1.0 if nb_or_svm[1] else 1.0e-10))

        start_index = 0
        for j in range(i):
            start_index += len(folds[j])
        end_index = start_index + len(folds[i])

        test_set = X[start_index:end_index]
        training_set = vstack([X[:start_index], X[end_index:]])
        classifier.fit(
            training_set,
            [f[1] for fold in (folds[:i] + folds[i + 1:]) for f in fold])

        correct_predictions = 0
        results = classifier.predict(test_set)
        for j in range(len(results)):
            correct_predictions += int(results[j] == folds[i][j][1])

        accuracies.append(100 * correct_predictions / len(results))

    if nb_or_svm[0] != "svm":
        print("smoothed" if nb_or_svm[1] else "unsmoothed", end=" ")

    print("stemmed" if stemming else "unstemmed", \
          "presence" if binary else "frequency", \
          "unigrams" if ngrams == (1, 1) else \
          ("bigrams" if ngrams == (2, 2) else \
          ("uni + bi" if ngrams == (1, 2) else "unknown")), \
          "accuracy:", sum(accuracies)/len(accuracies))
Example #16
def stemizeList(normalList):
	# STEMIZER FOR LIST
	porter = PorterStemmer()
	newList = []
	newDict = {}
	count = 0;	
	for lists in normalList:
		tokenList = []
		for token in lists:
			#print normalList.index(lists)," ",lists.index(token)
			tokenList.append(porter.stem(token,0,len(token)-1))
			if token in newDict:
				count = newDict[token]
				newDict[token] = count +1
			else:
				newDict[token] = 1
		newList.append(tokenList)
			#token = porter.stem(token,0,len(token)-1)
			
	return newList,newDict
Example #17
    def get_postlist(stop_answer, stem_answer, dict_terms):
        if stop_answer == 'no':
            stopwords = []
        if stop_answer == 'yes':
            stopwords = [
                'i', 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
                'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or',
                'that', 'the', 'this', 'to', 'was', 'what', 'when', 'where',
                'who', 'will', 'with', 'the'
            ]
        ps = PorterStemmer()
        position_list = []
        dict_posting = {}
        counter = 0
        for key, value in dict_terms.items():
            if isinstance(value, dict):
                for k, v in value.items():
                    if k == 'abstract':
                        val = v.replace(',', '').lower().split()
                        for index, word in enumerate(val):
                            if stem_answer == 'no':
                                stem_word = word
                            if stem_answer == 'yes':
                                stem_word = ps.stem(word, 0, len(word) - 1)
                            if stem_word not in stopwords:
                                if stem_word not in dict_posting:
                                    dict_posting[stem_word] = {}
                                if key not in dict_posting[stem_word]:
                                    dict_posting[stem_word][key] = {}
                                    dict_posting[stem_word][key][
                                        'frequency'] = 0
                                    dict_posting[stem_word][key][
                                        'position'] = []

                                dict_posting[stem_word][key]['frequency'] += 1
                                dict_posting[stem_word][key][
                                    'position'].append(index)
        # write the posting list once, after all documents have been processed
        with open('posting_list.json', 'w') as outfile:
            json.dump(dict_posting, outfile)
        print("Finished writing the posting list")
        return dict_posting
Example #18
def GetTFIDF():
	emails = None
	x_vals = []
	y_vals = []
	stemmer = PorterStemmer()
	
	# Get email chains
	with open("balanced_chains.pickle", "rb") as fp1:   # Unpickling
		emails = pickle.load(fp1)

	np.random.shuffle(emails)
	i = 0
	text_data = []
	for i in range(0, len(emails)):
		print "Evaluation Email %d" % (i)
		email, next_email, time_diff, bucket = emails[i]
		# if int(np.round(time_diff / 60)) > 72:
		# 	continue
		# Create stemmed body and append to text_data
		new_str = ""
		words = email['body'].split()
		for word in words:
			new_word = stemmer.stem(word, 0, len(word) - 1)
			new_str += new_word + " "
		new_str = new_str[:-1]
		text_data.append(new_str)

		# Append hour
		y_vals.append(int(np.round(time_diff / 60)))
		#y_vals.append(int(time_diff)

	b = np.array(y_vals)
	count_vect = CountVectorizer()
	X_train_counts = count_vect.fit_transform(text_data)
	tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tf = tf_transformer.transform(X_train_counts)
	return X_train_tf, b, count_vect, tf_transformer, text_data
Example #19
def process(text):
    '''Returns a list of words after carrying out the
    following text preprocessing and normalization steps'''
    # Convert text to lower case
    text = text.lower()
    #Remove 'Subject'
    text = re.sub(r'^sub(ject)?', ' ', text)
    # Strip HTML
    text = re.sub(r'<.*?>', ' ', text)
    # Normalize URLs
    text = re.sub(r'(http|https|ftp)://\S*', ' httpaddr ', text)
    # Normalize email addresses
    text = re.sub(r'[\w.+-]+@[\w.-]+', ' emailaddr ', text)
    # Normalize numbers
    text = re.sub(r'\b\d[\d,]*[.]*[\d]*\b', ' number ', text)
    # Normalize Dollars/Rupees
    text = re.sub(r'(\$|\brs\b|₹|£)+', ' dollar ', text)
    # Remove non-word characters
    text = re.sub(r'[^a-z]+', ' ', text)
    # Strip all whitespace characters and generate list of words
    # Stop Word Removal
    # stop_words = pickle.load(open('stopwords_set.pyset', 'rb'))
    text = [
        word for word in text.split()
        if word not in process.stop_words and len(word) > 2
    ]
    # Word Stemming
    p = PorterStemmer()
    result = []
    for word in text:
        try:
            stem_word = p.stem(word, 0, len(word) - 1)
            if stem_word not in process.stop_words:
                result.append(stem_word)
        except:
            pass
    return result
Example #20
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # For each document, keep its id number, the number of unique terms it contains, and each term with its term frequency.
    documentList = []
    docId = 1
    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]

    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)

        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)

        sortedList = sorted(listWords.items(), key=lambda t: t[0])
        output = {'id': docId, 'unique': len(sortedList), 'terms': sortedList}
        docId += 1
        documentList.append(output)

    for i in range(0, len(documentList)):
        print "Document:", documentList[i][
            'id'], "\nUnique Terms:", documentList[i][
                'unique'], "\nTerms:\n", documentList[i]['terms']
Example #21
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # For each document, keep its id number, the number of unique terms it contains, and each term with its term frequency.
    docId = 1

    #InvFileHash
    invFileHash = {}
    # Splits the multiple documents of the same file into list
    document = re.split(".I | \n.I", documents)[1:]

    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)

        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)

        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)

    print "File written: output.json"
    print "Number of terms:",len(invFileHash)
    writeToFile(invFileHash)
Example #22
def vectorSpaceModel(totalDocument, queryFileRead, stopwords):
    """
    Query to calculate the cosine similarity between document d and Query Q
    """

    # Loads the inverted File Hash
    dic = loadFromFile()
    #
    queryList = processQueryList(queryFileRead)
    # Calculation of Inverse Document Frequency
    IDF = calculateIDF(dic, totalDocument)
    # Calculation of Term Frequency
    TF = calculateTFList(dic)
    # Calculation of Wd from all the Term Frequency calculated
    WD = calculateWD(TF, totalDocument)

    pObj = PorterStemmer()
    fileWrite = open("outputdocument.txt", "w")
    for query in queryList:
        fileWrite.write("\n---------------------------------------------------------------------------------------")
        fileWrite.write("\nQuery: " + query)
        # Separate the string of query into list of words
        listQuery = re.findall(r'\w+', query)
        # Remove the stopwords and numbers from the list of query words
        queryWithoutStopword = [x for x in listQuery if x not in stopwords and x.isalpha()]
        # Stem the list of query words
        processedQuery = [pObj.stem(x.lower(), 0, len(x) - 1) for x in queryWithoutStopword]
        # Calculate the cosine measure (Similarity) for the query
        rankedDocList = calculateSimilarity(processedQuery, IDF, WD, totalDocument)
        fileWrite.write("\nTotal number of documents retrieved: " + str(len(rankedDocList)))
        fileWrite.write("\nDocument ID:\n")
        fileWrite.write(''.join(str(rankedDocList)))
        fileWrite.write("\n---------------------------------------------------------------------------------------")
    fileWrite.close()

    print "Writing to outputdocument.txt is complete."
Example #23
    def buildMatrix(self):
        # use the suffix-stripping (Porter) algorithm to stem each word
        porter_stemmer = PorterStemmer()
        for index in range(0, len(self.origin_documents)):
            document = self.origin_documents[index]
            # replace each document in origin_documents with an array of stemmed words
            self.origin_documents[index] = [porter_stemmer.stem(x, 0, len(x) - 1) for x in document.split()]

        # use 2000 most frequent words to generate words array
        temp_word = defaultdict(int)
        for document in self.origin_documents:
            for word in document:
                temp_word[word] += 1

        sorted_dict = sorted(temp_word.items(), key=operator.itemgetter(1))
        sorted_dict.reverse()
        self.words =  [x[0] for x in sorted_dict[0:self.word_size]]


        # build document array
        for index in range(0, len(self.origin_documents)):
            document = self.origin_documents[index]
            self.documents.append([])
            self.documents[index] = [document.count(word) for word in self.words]

        # print(self.documents[0], sum(self.documents[0]))

        # remove zero sum rows
        zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
        for value in zeros[::-1]:
            del self.labels[value]
            del self.documents[value]

        # zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]

        print(len(self.origin_documents), len(self.words), len(self.documents), self.words)
Example #24
def stemWords(tokens):
    stemmer = PorterStemmer()
    for index, entry in enumerate(tokens):
        tokens[index] = stemmer.stem(entry, 0, len(entry) - 1)
    return tokens
Example #25
def GetDataset():
    np.random.shuffle(emails)

    x_vals = []
    y_vals = []

    stemmer = PorterStemmer()
    word_mapping = GetWordDictionary(emails)

    i = 0
    text_data = []
    for i in range(0, len(emails)):
        #print "Evaluation Email %d" % (i)
        # note: time diff in mins
        email, next_email, time_diff, label = emails[i]

        # Create feature array
        features = []

        #Feature 1: Number of to
        features.append(len(email['to']))

        # Feature 2: Num words
        words = email['body'].split()
        lower_case_body = [
            stemmer.stem(x.lower(), 0,
                         len(x) - 1) for x in words
        ]
        features.append(len(words))

        # Feature 3: Number of CC
        features.append(email['cc_count'])

        # Feature 4: is reply
        if email['is_re']:
            features.append(1)
        else:
            features.append(0)

        # Feature 5: Time of Day (hour)
        date = email['date']['local_date']
        hour = date.hour
        features.append(hour)

        # Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        lower_case_subject = [
            stemmer.stem(x.lower(), 0,
                         len(x) - 1) for x in subject_words
        ]
        features.append(len(subject_words))

        # Feature 7: Day of Week
        features.append(date.weekday())

        # Feature 8: # Question marks in Body, bool in body
        features.append(email['body'].count('?'))

        # Feature 9: # Question marks in Subject, bool in subject
        features.append(email['subject'].count('?'))

        # NEW FEATURES

        # boolean: presence of ? in body / header
        features.append(1 if '?' in email['body'] else 0)
        features.append(1 if '?' in email['subject'] else 0)

        # Feature 12-13: "RESPONSE NEEDED" in subject or body
        keywords = ['response', 'please', 'can', 'urgent', 'important', 'need']
        for keyword in keywords:
            stemmed_keyword = stemmer.stem(keyword, 0, len(keyword) - 1)
            features.append(1 if stemmed_keyword in lower_case_subject else 0)
            features.append(1 if stemmed_keyword in lower_case_body else 0)

        x_vals.append(features)
        y_vals.append(label)

    X = np.array(x_vals)
    Y = np.array(y_vals)
    return X, Y
Example #26
	iter_ = regexObject.finditer(text)
	for result in iter_:
		if result.group('id') is not None:
			docs.append(Doc(int(result.group('id')))) # add new doc to the `list` with the id
		else:
			# as far as we know, not both of these can be `not none` at the same time
			if result.group('title') is not None:
				docs[-1].appendText(result.group('title').lower() + '\n') # append to the last element docs `list`
			if result.group('body') is not None:
				docs[-1].appendText(result.group('body').lower()) # append to the last element docs `list`
	return docs


stopWords = {'a', 'all', 'an', 'and', 'any', 'are', 'as', 'be', 'been', 'but', 'by', 'few', 'for', 'have', 'he', 'her', 'here', 'him', 'his', 'how', 'i', 'in', 'is', 'it', 'its', 'many', 'me', 'my', 'none', 'of', 'on', 'or', 'our', 'she', 'some', 'the', 'their', 'them', 'there', 'they', 'that', 'this', 'us', 'was', 'what', 'when', 'where', 'which', 'who', 'why', 'will', 'with', 'you', 'your'}

porterStemmer = PorterStemmer()


ro = re.compile(r'\d+\.+\d+|\w+')
"""Precompiled regex object that matches words, plain digits, and floats containing a dot."""

def regexStyleGenerator(text):
	"""
	A generator that yields tokens of the given text,
	performing stopword removal and stemming.
	:param text: the text to tokenize
	:return: None, but yields tokens
	"""
	global total_num_words_before, total_num_words_after  # for statistics
Example #27
DATASETS = "datasets"
TRAINING = DATASETS + "/training.txt"
VALIDATION = DATASETS + "/validation.txt"

SPORTS = 2
POLITICS = 1

HASHTAGS = re.compile("\#([^\s\,\.\#\"\'\+\=\|\$\%\^\:]+)")
URLS1 = re.compile("https?\:\/\/([^\s]+)")
URLS2 = re.compile("www\.([^\s]+)")
REFS = re.compile("\@([^\s\,\.\#\"\'\+\=\|\$\%\^\:\-]+)")
KEYWORDS = re.compile("(\w+)")
WHITESPACE = re.compile(
    "[\s\.\,\'\"\[\]\{\}\;\:\/\&\=\+\-\)\(\*\&\^\%\$\`\|\?\!]+")
STEMMER = PorterStemmer()

idtable = {}


def get_id(table, key, write=True):
    if key in table:
        return table[key]
    else:
        if write:
            table[key] = len(table)
            return table[key]
        else:
            return None


def register(table, keys, write=True):
Example #28
def run_train_test(training_file, testing_file):

    # Set the variables, params, dicts, sets
    alpha = 0.5

    stop_words = {'the', 'and'}
    logic_negation = {'t', 'not', 'no', 'never', 'dont', 'didnt', 'doesnt'}
    Porter_Stemmer = PorterStemmer()

    # Import training dataset
    training_start_time = time.time()
    vocab = set(['positive-words', 'negative-words'])
    wordcount_class_0 = {'positive-words': 0, 'negative-words': 0}
    wordcount_class_1 = {'positive-words': 0, 'negative-words': 0}
    total_reviews = 0
    reviewscount_0 = 0
    reviewscount_1 = 0
    train_labels = []
    train_reviews = []
    with training_file as f:
        for line in f:
            review, label = line.split(',')
            words = review.split(' ')
            del words[-1]
            label = int(label.strip("\n"))

            total_reviews += 1

            # Implement negation: add NOT_ to words after logical negation
            for i in range(len(words)):
                words[i] = Porter_Stemmer.stem(words[i])
                if words[i] in logic_negation:
                    try:
                        words[i + 1] = 'NOT_' + words[i + 1]
                    except:
                        continue
                    try:
                        words[i + 2] = 'NOT_' + words[i + 2]
                    except:
                        continue
                    try:
                        words[i + 3] = 'NOT_' + words[i + 3]
                    except:
                        continue

            bigrams = []
            for i in range(len(words) - 1):
                bigram = words[i] + ' ' + words[i + 1]
                bigrams.append(bigram)

            words = set(bigrams)

            # words = set(words)
            vocab.update(words)

            for word in words:
                if word not in wordcount_class_0.keys():
                    wordcount_class_0[word] = 0
                    wordcount_class_1[word] = 0

            if label == 0:
                reviewscount_0 += 1
                for word in words:
                    wordcount_class_0[word] += 1
                    # # Analyze Sentiment lexicons
                    # unigram1, unigram2 = word.split(' ')
                    # if unigram1 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram1 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1
                    # if unigram2 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram2 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1

            if label == 1:
                reviewscount_1 += 1
                for word in words:
                    wordcount_class_1[word] += 1
                    # # Analyze Sentiment lexicons
                    # unigram1, unigram2 = word.split(' ')
                    # if unigram1 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram1 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1
                    # if unigram2 in lexicons.pos_words:
                    #     wordcount_class_0['positive-words'] += 1
                    # if unigram2 in lexicons.neg_words:
                    #     wordcount_class_0['negative-words'] += 1

            train_labels.append(label)
            train_reviews.append(words)

    # Compute CPTs
    P_class = [0, 0]
    P_class[0] = reviewscount_0 / total_reviews
    P_class[1] = reviewscount_1 / total_reviews

    P_words_class_0 = {}
    P_words_class_1 = {}
    bottom_0 = sum(wordcount_class_0.values()) + alpha * len(vocab)
    bottom_1 = sum(wordcount_class_1.values()) + alpha * len(vocab)
    for word in vocab:
        if word in stop_words:
            P_words_class_0[word] = (0 + alpha) / bottom_0
            P_words_class_1[word] = (0 + alpha) / bottom_1
        else:
            P_words_class_0[word] = (wordcount_class_0[word] +
                                     alpha) / bottom_0
            P_words_class_1[word] = (wordcount_class_1[word] +
                                     alpha) / bottom_1

    # Inference on the training dataset
    predict_train_labels = []
    for doc in train_reviews:
        log_sum_0 = 0
        log_sum_1 = 0
        bag_of_words = set(doc)
        for word in bag_of_words:
            log_sum_0 += log(P_words_class_0[word])
            log_sum_1 += log(P_words_class_1[word])

            # # Sentiment Analysis
            # unigram1, unigram2 = word.split(' ')
            # if unigram1 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram1 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
            # if unigram2 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram2 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])

        Prob_c0 = log(P_class[0]) + log_sum_0
        Prob_c1 = log(P_class[1]) + log_sum_1
        if Prob_c0 > Prob_c1:
            c = 0
        else:
            c = 1
        predict_train_labels.append(c)

    # Compute training accuracy
    correct = 0
    for i in range(len(train_labels)):
        if predict_train_labels[i] == train_labels[i]:
            correct += 1
    train_accuracy = correct / len(train_labels)

    training_time = time.time() - training_start_time

    # Import testing dataset
    testing_start_time = time.time()
    test_reviews = []
    test_labels = []
    with testing_file as f:
        for line in f:
            review, label = line.split(',')
            words = review.split(' ')
            del words[-1]
            label = int(label.strip("\n"))

            # Implement negation: add NOT_ to words after logical negation
            for i in range(len(words)):
                words[i] = Porter_Stemmer.stem(words[i])
                if words[i] in logic_negation:
                    try:
                        words[i + 1] = 'NOT_' + words[i + 1]
                    except:
                        continue
                    try:
                        words[i + 2] = 'NOT_' + words[i + 2]
                    except:
                        continue
                    try:
                        words[i + 3] = 'NOT_' + words[i + 3]
                    except:
                        continue

            bigrams = []
            for i in range(len(words) - 1):
                bigram = words[i] + ' ' + words[i + 1]
                bigrams.append(bigram)

            words = set(bigrams)
            # words = set(words)

            test_labels.append(label)
            test_reviews.append(words)

    # Inference on the testing dataset
    predict_test_labels = []
    for doc in test_reviews:
        log_sum_0 = 0
        log_sum_1 = 0
        bag_of_words = set(doc)
        bag_of_words = vocab.intersection(bag_of_words)
        for word in bag_of_words:
            log_sum_0 += log(P_words_class_0[word])
            log_sum_1 += log(P_words_class_1[word])
            # # Sentiment Analysis
            # unigram1, unigram2 = word.split(' ')
            # if unigram1 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram1 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
            # if unigram2 in lexicons.pos_words:
            #     log_sum_0 += log(P_words_class_0['positive-words'])
            #     log_sum_1 += log(P_words_class_1['positive-words'])
            # if unigram2 in lexicons.neg_words:
            #     log_sum_0 += log(P_words_class_0['negative-words'])
            #     log_sum_1 += log(P_words_class_1['negative-words'])
        Prob_c0 = log(P_class[0]) + log_sum_0
        Prob_c1 = log(P_class[1]) + log_sum_1
        if Prob_c0 > Prob_c1:
            c = 0
        else:
            c = 1
        # print(c)
        predict_test_labels.append(c)

    # Compute testing accuracy
    correct = 0
    for i in range(len(test_labels)):
        if predict_test_labels[i] == test_labels[i]:
            correct += 1
    test_accuracy = correct / len(test_labels)

    # Print results
    testing_time = time.time() - testing_start_time
    print(round(training_time), 'seconds (training)')
    print(round(testing_time), 'seconds (labeling)')
    print(round(train_accuracy, 3), '(training)')
    print(round(test_accuracy, 3), '(testing)')
    print(len(vocab))

    return
Example #29
 def stemming(self, term):
     stem1 = PorterStemmer()
     output = stem1.stem(term)
     return output
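Note that this example passes a single argument to stem(), unlike the three-argument stem(word, 0, len(word) - 1) convention used elsewhere in these snippets; the single-argument form matches wrappers such as NLTK's PorterStemmer. A minimal comparison sketch, assuming NLTK is installed and not part of the original class:

# Comparison sketch using NLTK's single-argument Porter stemmer (assumes nltk is installed).
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem("running"))  # expected: 'run'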
Example #30
def preprocessing(file_string):
	try:
		f = open(file_string)
		email_contents = f.read()
		#
		#
		#2 strip the message header
		#
		cut = re.search(r'\n[ \t]*\n', email_contents).span()[1] - 1
		#      \n...\n
		# cut        ↑
		email_contents = email_contents[cut:]
		#
		#
		#3 other preprocessing
		#
		# lowercase √
		email_contents = email_contents.lower()
		# strip HTML tags √
		#
		email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
		# replace URLs with httpaddr √
		# http//https :// xx.xxx.xxx
		email_contents = re.sub(r'http\:\/\/[\w\.\/]+|https\:\/\/[\w\.\/]+', 'httpaddr', email_contents)
		#
		# replace email addresses with emailaddr
		#
		email_contents = re.sub(r'[\w\-\_]+\@[\w]+\.[\w]+', 'emailaddr', email_contents)
		# replace numbers with number √
		#    integers and decimals
		email_contents = re.sub(r'(\d+|\d+\.\d+)', 'number', email_contents)
		# replace dollar signs ($) with dollar √
		#
		email_contents = re.sub(r'\$', 'dollar', email_contents)
		# word stemming
		#         (complex to implement by hand)
		# remove non-word characters and punctuation
		#
		email_contents = re.sub(r'[\W]', ' ', email_contents)
		# remove single letters
		for i in range(50):
			email_contents = re.sub(r' [a-z] ', ' ', email_contents)
		# collapse extra whitespace
		email_contents = re.sub(r'[\t\n ]+', ' ', email_contents)
		#

		# from stemmer import PorterStemmer
		# stemmer = PorterStemmer()
		#
		from stemmer import PorterStemmer
		stemmer = PorterStemmer()
		email_contents = stemmer.stem(email_contents)
		#

		word_list = re.findall(r'\w+', email_contents)
		# return word_list
		for word in word_list:
			if word_frequency.get(word, "None") == "None":
				word_frequency[word] = 1
			else:
				word_frequency[word] += 1
		# return word_frequency
	except:
		print(file_string + "  ERROR")
		pass