def stemize(dictList):  # STEMIZER FOR DICTIONARY
    porter = PorterStemmer()
    for d in dictList:
        # iterate over a copy of the keys because the dict is mutated inside the loop
        for token in list(d.keys()):
            d[porter.stem(token, 0, len(token) - 1)] = d.pop(token)
    return dictList
def __init__(self, name, task_queue, result_queue):
    self.name = name
    self.r_stemmer = PorterStemmer()
    self.queue = task_queue
    self.queue2 = result_queue
    # load the stopword list
    pre_worker.load_stopwords()
def tokenize(sText, pairing=False):
    '''Given a string of text sText, returns a list of the individual stemmed
    tokens that occur in that string (in order). This is my quick and dirty
    Tokenizer. Satisfaction Not Guaranteed'''
    import string
    from stemmer import PorterStemmer
    sText = sText.lower()
    sText = re.sub("’", "'", sText)
    sText = re.sub("&.{0,6};", " ", sText)
    sText = re.sub("[\x80-\xff]", "", sText)
    sText = sText.split(None)
    for p in string.punctuation.replace("'", ""):
        try:
            sText = mapAndFold(lambda x: x.split(p), sText)
        except TypeError:  # empty string
            return []
    sText = mapAndFold(lambda x: x.split(), sText)
    sText = map(lambda x: x.strip("\'"), sText)
    sText = map(lambda x: x.strip("\""), sText)
    sText = map(lambda x: x.strip("_"), sText)
    sText = filter(lambda x: not re.match("\d+", x), sText)
    sText = filter(lambda x: not x == "", sText)
    sText = filter(lambda x: not x[0] == "@", sText)
    stemmer = PorterStemmer()
    if pairing:  # return original with token val in tuple
        return [(w, stemmer.stem(w, 0, len(w) - 1)) for w in sText]
    return [stemmer.stem(w, 0, len(w) - 1) for w in sText]
def ReadInfoFile(infoFile):
    global text2pddl
    fin = open(infoFile)
    lines = fin.readlines()
    fin.close()
    lines = [line.strip() for line in lines]
    unitDict = {}
    p = PorterStemmer()
    for line in lines:
        if len(line) == 0:
            continue
        parts = line.split(':')
        textName = parts[0]
        words = [p.stem(w.lower(), 0, len(w) - 1) for w in textName.split()]
        textName = ' '.join(words)
        if len(parts) > 1:
            pddlName = parts[1]
        else:
            pddlName = parts[0]
        assert(pddlName != '')
        assert(textName != '')
        for word in words:
            unitDict[word] = True
    # print unitDict.keys()
    return unitDict
def stemizeQuery(query):  # STEMIZER FOR QUERY
    porter = PorterStemmer()
    newList = []
    for q in query:
        newList.append(porter.stem(q, 0, len(q) - 1))
    return newList
def stemWords(listTokens):
    s = PorterStemmer()
    stemmedTerms = []
    for x in listTokens:
        stemmedTerms.append(s.stem(x, 0, len(x) - 1))
    return stemmedTerms
def stemWords(inList):
    outList = []
    ps = PorterStemmer()
    for token in inList:
        stemmed_token = ps.stem(token, 0, len(token) - 1)
        outList.append(stemmed_token)
    return outList
def stem_phrase(phrase):
    words = phrase.lower().replace('.', '').replace("'", '').split()
    # ignore stop words
    words = [word for word in words if word not in STOP_WORDS]
    p = PorterStemmer()
    return [p.stem(word, 0, len(word) - 1) for word in words]
def stemWords(tokens):
    """Function that stems the words."""
    # use porter stemmer
    # https://tartarus.org/martin/PorterStemmer/python.txt
    p = PorterStemmer()
    for index, word in enumerate(tokens):
        tokens[index] = p.stem(word, 0, len(word) - 1)
    return tokens
def stem(word):
    # word needs to be all lowercase before being passed to stem
    word = word.lower()
    # fancy stuff to remove .,?!"
    mymatch = re.compile('(\,|\.|\!|\?|\")')
    word = mymatch.sub(r'', word)
    p = PorterStemmer()
    word = p.stem(word, 0, len(word) - 1)
    return word
def splitToken(token, isStem=True):
    toks = token.split('_')
    word = toks[0].lower()
    tag = toks[1]
    if not word.isalnum():
        tag = 'PUNC'
    if isStem:
        # simple post stem
        p = PorterStemmer()
        #word = p.stem1(word,0,len(word)-1)
        word = p.stem(word, 0, len(word) - 1)
    return (word, tag)
def GetWordDictionary(emails):
    word_dict = {}
    count = 0
    stemmer = PorterStemmer()
    for email_case in emails:
        email = email_case[0]
        body = SplitText(email['body'])
        for word in body:
            modified_word = word
            if len(modified_word) > 1:
                modified_word = stemmer.stem(word, 0, len(word) - 1)
            if modified_word not in word_dict:
                word_dict[modified_word] = count
                count += 1
    return word_dict
def GetDataset():
    emails = None
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    with open("pickled_reduced_chains.txt", "rb") as fp1:  # Unpickling
        emails = pickle.load(fp1)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        print "Evaluation Email %d" % (i)
        email, next_email, time_diff = emails[i]
        print emails[i]
        # Create feature array
        features = []
        # if np.round(time_diff / 60) > 72: continue
        # Feature 1: Number of to
        features.append(len(email['to']))
        # Feature 2: Num words
        words = email['body'].split()
        features.append(len(words))
        # Feature 3: Number of CC
        features.append(email['cc_count'])
        # Feature 4: is reply
        if email['is_re']:
            features.append(1)
        else:
            features.append(0)
        # Feature 5: Time of Day (minutes)
        date = email['date']['local_date']
        hour = date.hour
        features.append(hour)
        # Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        features.append(len(subject_words))
        # Feature 7: Day of Week
        features.append(date.weekday())
        # Feature 8: Question marks in Body
        features.append(email['body'].count('?'))
        # Feature 9: Question marks in Subject
        features.append(email['subject'].count('?'))
        x_vals.append(features)
        # Append y_value for training point
        y_vals.append(int(np.round(time_diff / 60)))
    a = np.array(x_vals)
    b = np.array(y_vals)
    return a, b
def main():
    # Reading the document from the file
    fileName = "cran.all.1400"
    documents = readFromFile(fileName, "r")
    # Reading stop words from the file
    stopwordsList = readFromFile("stopwords.txt", "r")
    stopwords = stopwordsList.split()
    # For each document we track its id, the number of unique terms, and each term with its term frequency.
    docId = 1
    # Inverted file hash
    invFileHash = {}
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    totalDocument = len(document)
    print "Total documents:", totalDocument
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word.lower(), 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)
    # Writes to the Inverted File Hash file
    writeToFile(invFileHash)
    # To read the queries list from the cran query file
    queryFileRead = readFromFile("cran.qry", "r")
    # Calculate the Vector Space Model (total number of documents, stopwords list)
    vectorSpaceModel(totalDocument, queryFileRead, stopwords)
def classify(folds, nb_or_svm, ngrams, stemming, binary):
    p = PorterStemmer()
    vectorizer = CountVectorizer(input="filename",
                                 ngram_range=ngrams,
                                 tokenizer=(lambda d: [(p.stem(t, 0, len(t) - 1) if stemming else t) for t in d.split()]),
                                 binary=binary,
                                 min_df=4, max_df=1.0)
    X = vectorizer.fit_transform([f[0] for fold in folds for f in fold])
    accuracies = []
    for i in range(len(folds)):
        classifier = SVC(gamma="auto", kernel="linear") if nb_or_svm[0] == "svm" \
            else MultinomialNB(alpha=(1.0 if nb_or_svm[1] else 1.0e-10))
        start_index = 0
        for j in range(i):
            start_index += len(folds[j])
        end_index = start_index + len(folds[i])
        test_set = X[start_index:end_index]
        training_set = vstack([X[:start_index], X[end_index:]])
        classifier.fit(
            training_set,
            [f[1] for fold in (folds[:i] + folds[i + 1:]) for f in fold])
        correct_predictions = 0
        results = classifier.predict(test_set)
        for j in range(len(results)):
            correct_predictions += int(results[j] == folds[i][j][1])
        accuracies.append(100 * correct_predictions / len(results))
    if nb_or_svm[0] != "svm":
        print("smoothed" if nb_or_svm[1] else "unsmoothed", end=" ")
    print("stemmed" if stemming else "unstemmed",
          "presence" if binary else "frequency",
          "unigrams" if ngrams == (1, 1) else
          ("bigrams" if ngrams == (2, 2) else
           ("uni + bi" if ngrams == (1, 2) else "unknown")),
          "accuracy:", sum(accuracies) / len(accuracies))
def GetPddlObj(_sWord):
    global text2pddl
    if text2pddl == None:
        ReadMinecraftDict(minecraftDictFile)
    setObjs = set()
    p = PorterStemmer()
    sLastWord = p.stem(_sWord.lower(), 0, len(_sWord) - 1)
    if sLastWord == 'block':
        return setObjs
    # print sLastWord
    for sTextName in text2pddl.keys():
        if text2pddl[sTextName] == 'NULL':
            continue
        lstWords = sTextName.split(' ')
        sLastTextWord = lstWords[len(lstWords) - 1]
        if sLastTextWord == 'block':
            if len(lstWords) == 1:
                continue
            sLastTextWord = lstWords[len(lstWords) - 2]
        if sLastTextWord == sLastWord:
            setObjs.add(text2pddl[sTextName])
    return list(setObjs)
def stemizeList(normalList):  # STEMIZER FOR LIST
    porter = PorterStemmer()
    newList = []
    newDict = {}
    count = 0
    for lists in normalList:
        tokenList = []
        for token in lists:
            # print normalList.index(lists), " ", lists.index(token)
            tokenList.append(porter.stem(token, 0, len(token) - 1))
            if token in newDict:
                count = newDict[token]
                newDict[token] = count + 1
            else:
                newDict[token] = 1
        newList.append(tokenList)
        # token = porter.stem(token,0,len(token)-1)
    return newList, newDict
def get_postlist(stop_answer, stem_answer, dict_terms):
    if stop_answer == 'no':
        stopwords = []
    if stop_answer == 'yes':
        stopwords = [
            'i', 'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
            'for', 'from', 'how', 'in', 'is', 'it', 'of', 'on', 'or', 'that',
            'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who',
            'will', 'with'
        ]
    ps = PorterStemmer()
    position_list = []
    dict_posting = {}
    counter = 0
    for key, value in dict_terms.items():
        if isinstance(value, dict):
            for k, v in value.items():
                if k == 'abstract':
                    val = v.replace(',', '').lower().split()
                    for index, word in enumerate(val):
                        if stem_answer == 'no':
                            stem_word = word
                        if stem_answer == 'yes':
                            stem_word = ps.stem(word, 0, len(word) - 1)
                        if stem_word not in stopwords:
                            if stem_word not in dict_posting:
                                dict_posting[stem_word] = {}
                            if key not in dict_posting[stem_word]:
                                dict_posting[stem_word][key] = {}
                                dict_posting[stem_word][key]['frequency'] = 0
                                dict_posting[stem_word][key]['position'] = []
                            dict_posting[stem_word][key]['frequency'] += 1
                            dict_posting[stem_word][key]['position'].append(index)
    with open('posting_list.json', 'w') as outfile:
        json.dump(dict_posting, outfile)
    print("Finished writing the posting list")
    return dict_posting
def __init__(self, fileA, fileB):
    self.__allWords = set()
    self.__wordsA = dict()
    self.__wordsB = dict()
    with open(fileA, 'r') as document:
        for line in document:
            words = line.strip().split()
            for word in words:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                if word in self.__wordsA.keys():
                    self.__wordsA[word] += 1
                else:
                    self.__wordsA[word] = 1
    with open(fileB, 'r') as document:
        for line in document:
            words = line.strip().split()
            for word in words:
                p = PorterStemmer()
                word = p.stem(word, 0, len(word) - 1)
                if word in self.__wordsB.keys():
                    self.__wordsB[word] += 1
                else:
                    self.__wordsB[word] = 1
    self.__allWords = set(self.__wordsA.keys()) | set(self.__wordsB.keys())
    self.__table = {t[1]: t[0] for t in enumerate(self.__allWords)}
def ReadMinecraftDict(infoFile):
    global text2pddl
    fin = open(infoFile)
    lines = fin.readlines()
    fin.close()
    lines = [line.strip() for line in lines]
    text2pddl = {}
    p = PorterStemmer()
    for line in lines:
        if len(line) == 0:
            continue
        parts = line.split(':')
        textName = parts[0]
        words = [p.stem(w.lower(), 0, len(w) - 1) for w in textName.split()]
        textName = ' '.join(words)
        if len(parts) > 1:
            pddlName = parts[1]
        else:
            pddlName = parts[0]
        assert(pddlName != '')
        assert(textName != '')
        text2pddl[textName] = pddlName
def process(text):
    '''Returns a list of words after carrying out the following text
    preprocessing and normalization steps'''
    # Convert text to lower case
    text = text.lower()
    # Remove 'Subject'
    text = re.sub(r'^sub(ject)?', ' ', text)
    # Strip HTML
    text = re.sub(r'<.*?>', ' ', text)
    # Normalize URLs
    text = re.sub(r'(http|https|ftp)://\S*', ' httpaddr ', text)
    # Normalize email addresses
    text = re.sub(r'[\w.+-]+@[\w.-]+', ' emailaddr ', text)
    # Normalize numbers
    text = re.sub(r'\b\d[\d,]*[.]*[\d]*\b', ' number ', text)
    # Normalize Dollars/Rupees
    text = re.sub(r'(\$|\brs\b|₹|£)+', ' dollar ', text)
    # Remove non-word characters
    text = re.sub(r'[^a-z]+', ' ', text)
    # Strip all whitespace characters and generate list of words
    # Stop Word Removal
    # stop_words = pickle.load(open('stopwords_set.pyset', 'rb'))
    text = [
        word for word in text.split()
        if word not in process.stop_words and len(word) > 2
    ]
    # Word Stemming
    p = PorterStemmer()
    result = []
    for word in text:
        try:
            stem_word = p.stem(word, 0, len(word) - 1)
            if stem_word not in process.stop_words:
                result.append(stem_word)
        except:
            pass
    return result
def GetTFIDF():
    emails = None
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    # Get email chains
    with open("balanced_chains.pickle", "rb") as fp1:  # Unpickling
        emails = pickle.load(fp1)
    np.random.shuffle(emails)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        print "Evaluation Email %d" % (i)
        email, next_email, time_diff, bucket = emails[i]
        # if int(np.round(time_diff / 60)) > 72:
        #     continue
        # Create stemmed body and append to text_data
        new_str = ""
        words = email['body'].split()
        for word in words:
            new_word = stemmer.stem(word, 0, len(word) - 1)
            new_str += new_word + " "
        new_str = new_str[:-1]
        text_data.append(new_str)
        # Append hour
        y_vals.append(int(np.round(time_diff / 60)))
        # y_vals.append(int(time_diff))
    b = np.array(y_vals)
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(text_data)
    tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    return X_train_tf, b, count_vect, tf_transformer, text_data
def buildMatrix(self):
    # use the suffix-stripping algorithm to stem words
    porter_stemmer = PorterStemmer()
    for index in range(0, len(self.origin_documents)):
        document = self.origin_documents[index]
        # change each document in the origin_documents array to an array of stemmed words
        self.origin_documents[index] = [porter_stemmer.stem(x, 0, len(x) - 1) for x in document.split()]
    # use the 2000 most frequent words to generate the words array
    temp_word = defaultdict(int)
    for document in self.origin_documents:
        for word in document:
            temp_word[word] += 1
    sorted_dict = sorted(temp_word.items(), key=operator.itemgetter(1))
    sorted_dict.reverse()
    self.words = [x[0] for x in sorted_dict[0:self.word_size]]
    # build document array
    for index in range(0, len(self.origin_documents)):
        document = self.origin_documents[index]
        self.documents.append([])
        self.documents[index] = [document.count(word) for word in self.words]
    # print(self.documents[0], sum(self.documents[0]))
    # remove zero-sum rows
    zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
    for value in zeros[::-1]:
        del self.labels[value]
        del self.documents[value]
    # zeros = [i for i, value in enumerate(self.documents) if sum(value) == 0]
    print(len(self.origin_documents), len(self.words), len(self.documents), self.words)
def vectorSpaceModel(totalDocument, queryFileRead, stopwords):
    """ Query to calculate the cosine similarity between document d and Query Q """
    # Loads the inverted File Hash
    dic = loadFromFile()
    queryList = processQueryList(queryFileRead)
    # Calculation of Inverse Document Frequency
    IDF = calculateIDF(dic, totalDocument)
    # Calculation of Term Frequency
    TF = calculateTFList(dic)
    # Calculation of Wd from all the Term Frequency calculated
    WD = calculateWD(TF, totalDocument)
    pObj = PorterStemmer()
    fileWrite = open("outputdocument.txt", "w")
    for query in queryList:
        fileWrite.write("\n---------------------------------------------------------------------------------------")
        fileWrite.write("\nQuery: " + query)
        # Separate the string of query into list of words
        listQuery = re.findall(r'\w+', query)
        # Remove the stopwords and numbers from the list of query words
        queryWithoutStopword = [x for x in listQuery if x not in stopwords and x.isalpha()]
        # Stem the list of query words
        processedQuery = [pObj.stem(x.lower(), 0, len(x) - 1) for x in queryWithoutStopword]
        # Calculate the cosine measure (Similarity) for the query
        rankedDocList = calculateSimilarity(processedQuery, IDF, WD, totalDocument)
        fileWrite.write("\nTotal number of documents retrieved: " + str(len(rankedDocList)))
        fileWrite.write("\nDocument ID:\n")
        fileWrite.write(''.join(str(rankedDocList)))
        fileWrite.write("\n---------------------------------------------------------------------------------------")
    fileWrite.close()
    print "Writing to outputdocument.txt file completes."
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # List that maintains, for each document, its id, the number of unique terms, and each term with its term frequency.
    documentList = []
    docId = 1
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        sortedList = sorted(listWords.items(), key=lambda t: t[0])
        output = {'id': docId, 'unique': len(sortedList), 'terms': sortedList}
        docId += 1
        documentList.append(output)
    for i in range(0, len(documentList)):
        print "Document:", documentList[i]['id'], "\nUnique Terms:", documentList[i]['unique'], "\nTerms:\n", documentList[i]['terms']
def main():
    array = []
    array2 = []
    p = PorterStemmer()
    with open(sys.argv[1]) as f:
        for line in f:
            if len(line) > 1:
                array.append(line[0:len(line) - 1])
    word_dictionary, tag_dictionary, count = read_input(array, p)
    with open(sys.argv[2]) as f:
        for line in f:
            if len(line) > 1:
                array2.append(line[0:len(line) - 1])
    read_test_data(array2, word_dictionary, tag_dictionary, count)
def main():
    # Reading the document from the file
    file = open("cran.all.1400", "r")
    documents = file.read()
    # Reading stop words from the file
    fileStopwords = open('stopwords.txt', 'r')
    stopwordsList = fileStopwords.read()
    stopwords = stopwordsList.split()
    # For each document we track its id, the number of unique terms, and each term with its term frequency.
    docId = 1
    # Inverted file hash
    invFileHash = {}
    # Splits the multiple documents of the same file into a list
    document = re.split(".I | \n.I", documents)[1:]
    for doc in enumerate(document):
        startIndex = doc[1].index('.W\n')
        text = doc[1][startIndex + 3:]
        words = re.findall(r'\w+', text)
        pObj = PorterStemmer()
        listWords = {}
        for word in words:
            flagStopwords = word.lower() in stopwords
            if (not flagStopwords and word.isalpha()):
                stemWord = pObj.stem(word, 0, len(word) - 1)
                listWords = addToDict(listWords, stemWord)
        docList = addDoc(docId, listWords)
        docId += 1
        invFileHash = createInvFileHash(invFileHash, docList)
    print "File written: output.json"
    print "Number of terms:", len(invFileHash)
    writeToFile(invFileHash)
class Cleaner(object):
    def __init__(self, stopwords):
        self.stopwords = stopwords
        self.stemmer = PorterStemmer()

    def clean_word(self, word):
        word = word.strip().lower()
        word = filter(lambda c: c.isalnum(), word)
        if word in self.stopwords:
            word = None
        else:
            word = self.stemmer.stem(word, 0, len(word) - 1)
        return word

    def clean_wordlist(self, wordlist):
        wordlist = " ".join(wordlist).replace('-', ' ').split()
        clean_list = map(lambda x: self.clean_word(x), wordlist)
        return [word for word in clean_list if word]

    @staticmethod
    def make_printable(phrase):
        return filter(lambda c: c in string.printable, phrase)
class Indexer(): # remove stop words and do stemming STOP_WORD_LIST = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","just","keep","keeps","kept","know","knows","known","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","que","quite","qv","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t's","take","taken","tell","tends","th","than","thank","thanks","t
hanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","value","various","very","via","viz","vs","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","zero"] def __init__(self): logging.debug('Indexer => Init params:self') self.idx_fields = [] # field of document to be indexed #self.STOP_WORD_LIST = [] self.P = PorterStemmer() # end of function ''' def set_stop_words(self,stop_word_list): self.STOP_WORD_LIST = stop_word_list # end of function ''' def set_idx_fields(self,fields): logging.debug('Indexer => set_idx_fields fields:' + str(fields)) self.idx_fields = fields def add_idx_field(self,field_name): self.idx_fields.append(field_name) def clean(self,word): #preprocess word word = word.lower() word = word.strip("\n\t,.(){}?!;'") if word not in self.STOP_WORD_LIST: word = self.P.stem(word,0,len(word)-1) else: word = "" return word # end of function def tokenize(self, text): #list word_idx = [] # split lines lines = text.split('\n') for line in lines: # split words words = line.split(' ') for word in words: word = self.clean(word) if len(word) > 1: word_idx.append(word) # make a set (remove duplicate) word_idx = set(word_idx) return word_idx # end of function def index(self, document): if isinstance(document,list): document = document[0] text = "" # get text from document to be indexed for field in self.idx_fields: text += document[field] + " " return self.tokenize(text) def stem(self, words): return [self.tokenize(word) for word in words]
def stemming(self, term):
    output = ""
    stem1 = PorterStemmer()
    output = stem1.stem(term)
    return output
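# Note: the stem(term) call above takes a single argument, while most snippets in
# this collection use the three-argument stem(word, 0, len(word) - 1) convention of
# Martin Porter's reference Python implementation (the tartarus.org link cited earlier).
# A minimal adapter sketch that tolerates both calling conventions; it assumes the
# reference implementation is saved locally as stemmer.py, and the helper name
# stem_token is illustrative rather than taken from any of these sources.

# Hypothetical adapter (not from the snippets above): try the one-argument call used
# by NLTK-style stemmers first, then fall back to the three-argument (word, i, j)
# signature of the tartarus.org reference implementation.
from stemmer import PorterStemmer  # assumption: reference code saved as stemmer.py

_stemmer = PorterStemmer()

def stem_token(word):
    word = word.lower()
    try:
        return _stemmer.stem(word)                    # NLTK-style: stem(word)
    except TypeError:
        return _stemmer.stem(word, 0, len(word) - 1)  # reference-style: stem(word, i, j)

if __name__ == "__main__":
    print(stem_token("running"))  # expected output: "run"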
def stemWord(self, word):
    return PorterStemmer.stem(self, word, 0, len(word) - 1)
def __init__(self, language):
    PorterStemmer.__init__(self)
class W2V: def __init__(self, path): self.stemmer = PorterStemmer() self.path = path self.cur_idx = 0 self.batch = 2 self.sample = 0.001 self.vocab_size = 10000 self.total_count = 0 self.word_count = Counter() self.word2idx = defaultdict(int) self.idx2word = {} self.word_sample = {} self.batch_size = 128 self.embedding_size = 128 # Dimension of the embedding vector. self.skip_window = 3 # How many words to consider left and right. self.raw_sample_probs = [0.5, 0.3, 0.2] self.sample_probs = [] sum = 0 for prob in self.raw_sample_probs: sum += prob self.sample_probs.append(sum) self.num_skips = 2 # How many times to reuse an input to generate a label. self.valid_size = 16 # Random set of words to evaluate similarity on. self.valid_window = 100 # Only pick dev samples in the head of the distribution. #self.valid_examples = np.random.choice(valid_window, valid_size, replace=False) self.num_negative_sampled = 64 # Number of negative examples to sample. self.batch_index = 0 self.get_stat() def get_stat(self): import json import collections import os.path filename = "stat" if os.path.exists(filename): f = open(filename, 'rb') obj = pickle.load(f) self.word2idx = obj["word2idx"] self.idx2word = obj["idx2word"] self.word_count = obj["word_count"] self.word_sample = obj["word_sample"] self.total_count = obj["total_count"] return line_idx = 0 text_data = "" with open(self.path, "r") as ins: for line in ins: obj = json.loads(line) reviewText = obj["reviewText"] summary = obj["summary"] reviewerID = obj["reviewerID"] overall = obj["overall"] asin = obj["asin"] text_data = " ".join([text_data ,reviewText, summary]) line_idx += 1 if line_idx % 1000 == 0: self.word_count + collections.Counter(self.stemmer.get_stem_wordlist(text_data.split())) text_data = "" self.word_count = self.word_count.most_common(self.vocab_size-1) for word,cnt in self.word_count: self.word2idx[word] = 1+len(self.word2idx) #self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) #calculate the sample #threshold_count = self.sample * self.total_count #for word in self.word_count: # word_probability = (sqrt(self.word_count[word] / threshold_count) + 1) * (threshold_count / self.word_count[word]) # self.word_sample[word] = int(round(word_probability * 2**32)) f = open(filename, 'wb') pickle.dump({"word2idx":self.word2idx,"idx2word":self.idx2word,"word_count":self.word_count,"word_sample":self.word_sample,"total_count":self.total_count},f) def get_batch(self): #global batch_index #calculate sample prob inside window #sample_probs = [] #sum = (1<<(self.skip_window)) - 1 #prob = 0 #for idx in range(self.skip_window,0,-1): # prob += (1<<(idx-1)) / sum # sample_probs.append(prob) #prepare buffer span = 2 * self.skip_window + 1 # [ skip_window target skip_window ] buffer = collections.deque(maxlen=span) context_data = [] #list append is better than numpy append, so using list append first and then convert into numpy obj target_data = [] # same as above for i in range(1, self.batch_size): idx = self.batch_index + i #line = linecache.getline(self.path, idx) line = '{ "reviewerID": "A2SUAM1J3GNN3B", "asin": "0000013714", "reviewerName": "J. McDonald", "helpful": [2, 3], "reviewText": "I bought this for my husband who plays the piano. He is having a wonderful time playing these old hymns. The music is at times hard to read because we think the book was published for singing from more than playing from. 
Great purchase though!", "overall": 5.0, "summary": "Heavenly Highway Hymns", "unixReviewTime": 1252800000, "reviewTime": "09 13, 2009" }' if line is None or len(line) == 0: print("current idx,", idx, " current batch_idx, ", self.batch_index, " line: ", line) continue obj = json.loads(line) reviewText = obj["reviewText"] summary = obj["summary"] reviewerID = obj["reviewerID"] overall = obj["overall"] asin = obj["asin"] text_data = " ".join([reviewText, summary]).split() for word_idx in range(0, len(text_data)): while len(buffer) < span: buffer.append(self.word2idx[self.stemmer.get_stem_word(text_data[word_idx])]) target_word = self.word2idx[buffer[self.skip_window]] context_word = target_word avoid_context_word = [target_word] r = rd.random() for cnt_idx in range(0, int(self.skip_window/2)): #random pick up skip_window/2 context word while context_word in avoid_context_word: #for avoid repeat sample for rank_idx in range(0,self.skip_window): #from closest to farest if r <= self.sample_probs[rank_idx]: if rd.random() >= 0.5: context_word = self.word2idx[buffer[self.skip_window - (rank_idx + 1)]] else: context_word = self.word2idx[buffer[self.skip_window + (rank_idx + 1)]] break if context_word not in avoid_context_word: avoid_context_word.append(context_word) context_data.append(context_word) target_data.append(target_word) #for next word buffer.append(self.stemmer.get_stem_word(text_data[word_idx])) #update global batch_index self.batch_index += self.batch_size context_data = np.ndarray(shape=(len(context_data)), dtype=np.int32) target_data = np.ndarray(shape=(len(target_data), 1), dtype=np.int32) return context_data, target_data def train(self): embedding_size = 128 # Dimension of the embedding vector. graph = tf.Graph() with graph.as_default(): # Input data. train_inputs = tf.placeholder(tf.int32) train_labels = tf.placeholder(tf.int32) # Ops and variables pinned to the CPU because of missing GPU implementation with tf.device('/cpu:0'): # Look up embeddings for inputs. embeddings = tf.Variable( tf.random_uniform([self.vocab_size, embedding_size], -1.0, 1.0), name = "emb") embed = tf.nn.embedding_lookup(embeddings, train_inputs) # Construct the variables for the NCE loss nce_weights = tf.Variable( tf.truncated_normal([self.vocab_size, embedding_size], stddev=1.0 / sqrt(embedding_size))) nce_biases = tf.Variable(tf.zeros([self.vocab_size])) # Compute the average NCE loss for the batch. # tf.nce_loss automatically draws a new sample of the negative labels each # time we evaluate the loss. loss = tf.reduce_mean( tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels, self.num_negative_sampled, self.vocab_size)) # Construct the SGD optimizer using a learning rate of 1.0. optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss) saver = tf.train.Saver() init = tf.initialize_all_variables() self.num_steps = 1000000 with tf.Session(graph=graph) as session: # We must initialize all variables before we use them. init.run() print("Initialized") average_loss = 0 for step in range(self.num_steps): batch_inputs, batch_labels = self.get_batch() feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels} # We perform one update step by evaluating the optimizer op (including it # in the list of returned values for session.run() _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict) average_loss += loss_val if step % 2000 == 0: if step > 0: average_loss /= 50 # The average loss is an estimate of the loss over the last 2000 batches. 
print("Average loss at step ", step, ": ", average_loss) filename = "_".join(["embedding",str(step)]) saver.save(session, filename) average_loss = 0 return
    iter_ = regexObject.finditer(text)
    for result in iter_:
        if result.group('id') is not None:
            docs.append(Doc(int(result.group('id'))))  # add new doc to the `list` with the id
        else:
            # as far as we know, not both of these can be `not none` at the same time
            if result.group('title') is not None:
                docs[-1].appendText(result.group('title').lower() + '\n')  # append to the last element of the docs `list`
            if result.group('body') is not None:
                docs[-1].appendText(result.group('body').lower())  # append to the last element of the docs `list`
    return docs


stopWords = {'a', 'all', 'an', 'and', 'any', 'are', 'as', 'be', 'been', 'but', 'by ', 'few', 'for',
             'have', 'he', 'her', 'here', 'him', 'his', 'how', 'i', 'in', 'is', 'it', 'its', 'many',
             'me', 'my', 'none', 'of', 'on ', 'or', 'our', 'she', 'some', 'the', 'their', 'them',
             'there', 'they', 'that ', 'this', 'us', 'was', 'what', 'when', 'where', 'which', 'who',
             'why', 'will', 'with', 'you', 'your'}

portertStemmer = PorterStemmer()
ro = re.compile(r'\d+\.+\d+|\w+')
"""regex object for finding words, digits, and floats containing a dot in them; pre-cooked, ready to eat"""


def regexStyleGenerator(text):
    """
    A generator that returns tokens of the given text;
    performs stopword removal and stemming.
    :param text: the text to tokenize
    :return: None, but yields tokens
    """
    global total_num_words_before, total_num_words_after  # for statistics
def run_train_test(training_file, testing_file): # Set the variables, params, dicts, sets alpha = 0.5 stop_words = {'the', 'and'} logic_negation = {'t', 'not', 'no', 'never', 'dont', 'didnt', 'doesnt'} Porter_Stemmer = PorterStemmer() # Import training dataset training_start_time = time.time() vocab = set(['positive-words', 'negative-words']) wordcount_class_0 = {'positive-words': 0, 'negative-words': 0} wordcount_class_1 = {'positive-words': 0, 'negative-words': 0} total_reviews = 0 reviewscount_0 = 0 reviewscount_1 = 0 train_labels = [] train_reviews = [] with training_file as f: for line in f: review, label = line.split(',') words = review.split(' ') del words[-1] label = int(label.strip("\n")) total_reviews += 1 # Implement negation: add NOT_ to words after logical negation for i in range(len(words)): words[i] = Porter_Stemmer.stem(words[i]) if words[i] in logic_negation: try: words[i + 1] = 'NOT_' + words[i + 1] except: continue try: words[i + 2] = 'NOT_' + words[i + 2] except: continue try: words[i + 3] = 'NOT_' + words[i + 3] except: continue bigrams = [] for i in range(len(words) - 1): bigram = words[i] + ' ' + words[i + 1] bigrams.append(bigram) words = set(bigrams) # words = set(words) vocab.update(words) for word in words: if word not in wordcount_class_0.keys(): wordcount_class_0[word] = 0 wordcount_class_1[word] = 0 if label == 0: reviewscount_0 += 1 for word in words: wordcount_class_0[word] += 1 # # Analyze Sentiment lexicons # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram1 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 # if unigram2 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram2 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 if label == 1: reviewscount_1 += 1 for word in words: wordcount_class_1[word] += 1 # # Analyze Sentiment lexicons # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram1 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 # if unigram2 in lexicons.pos_words: # wordcount_class_0['positive-words'] += 1 # if unigram2 in lexicons.neg_words: # wordcount_class_0['negative-words'] += 1 train_labels.append(label) train_reviews.append(words) # Compute CPTs P_class = [0, 0] P_class[0] = reviewscount_0 / total_reviews P_class[1] = reviewscount_1 / total_reviews P_words_class_0 = {} P_words_class_1 = {} bottom_0 = sum(wordcount_class_0.values()) + alpha * len(vocab) bottom_1 = sum(wordcount_class_1.values()) + alpha * len(vocab) for word in vocab: if word in stop_words: P_words_class_0[word] = (0 + alpha) / bottom_0 P_words_class_1[word] = (0 + alpha) / bottom_1 else: P_words_class_0[word] = (wordcount_class_0[word] + alpha) / bottom_0 P_words_class_1[word] = (wordcount_class_1[word] + alpha) / bottom_1 # Inference on the training dataset predict_train_labels = [] for doc in train_reviews: log_sum_0 = 0 log_sum_1 = 0 bag_of_words = set(doc) for word in bag_of_words: log_sum_0 += log(P_words_class_0[word]) log_sum_1 += log(P_words_class_1[word]) # # Sentiment Analysis # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram1 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) # if unigram2 in 
lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram2 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) Prob_c0 = log(P_class[0]) + log_sum_0 Prob_c1 = log(P_class[1]) + log_sum_1 if Prob_c0 > Prob_c1: c = 0 else: c = 1 predict_train_labels.append(c) # Compute training accuracy correct = 0 for i in range(len(train_labels)): if predict_train_labels[i] == train_labels[i]: correct += 1 train_accuracy = correct / len(train_labels) training_time = time.time() - training_start_time # Import testing dataset testing_start_time = time.time() test_reviews = [] test_labels = [] with testing_file as f: for line in f: review, label = line.split(',') words = review.split(' ') del words[-1] label = int(label.strip("\n")) # Implement negation: add NOT_ to words after logical negation for i in range(len(words)): words[i] = Porter_Stemmer.stem(words[i]) if words[i] in logic_negation: try: words[i + 1] = 'NOT_' + words[i + 1] except: continue try: words[i + 2] = 'NOT_' + words[i + 2] except: continue try: words[i + 3] = 'NOT_' + words[i + 3] except: continue bigrams = [] for i in range(len(words) - 1): bigram = words[i] + ' ' + words[i + 1] bigrams.append(bigram) words = set(bigrams) # words = set(words) test_labels.append(label) test_reviews.append(words) # Inference on the testing dataset predict_test_labels = [] for doc in test_reviews: log_sum_0 = 0 log_sum_1 = 0 bag_of_words = set(doc) bag_of_words = vocab.intersection(bag_of_words) for word in bag_of_words: log_sum_0 += log(P_words_class_0[word]) log_sum_1 += log(P_words_class_1[word]) # # Sentiment Analysis # unigram1, unigram2 = word.split(' ') # if unigram1 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram1 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) # if unigram2 in lexicons.pos_words: # log_sum_0 += log(P_words_class_0['positive-words']) # log_sum_1 += log(P_words_class_1['positive-words']) # if unigram2 in lexicons.neg_words: # log_sum_0 += log(P_words_class_0['negative-words']) # log_sum_1 += log(P_words_class_1['negative-words']) Prob_c0 = log(P_class[0]) + log_sum_0 Prob_c1 = log(P_class[1]) + log_sum_1 if Prob_c0 > Prob_c1: c = 0 else: c = 1 # print(c) predict_test_labels.append(c) # Compute testing accuracy correct = 0 for i in range(len(test_labels)): if predict_test_labels[i] == test_labels[i]: correct += 1 test_accuracy = correct / len(test_labels) # Print results testing_time = time.time() - testing_start_time print(round(training_time), 'seconds (training)') print(round(testing_time), 'seconds (labeling)') print(round(train_accuracy, 3), '(training)') print(round(test_accuracy, 3), '(testing)') print(len(vocab)) return
        'browser': 1
    },
    'file': {
        'ls-time': 1
    },
    'ls': {
        'ls-time': 1,
        'tag-time': 1
    },
    'refspec': {
        'new-branch-push': 2
    }
}

computed = {}
stemmer = PorterStemmer()
for word in sys.argv[1:]:
    word = stemmer.stem(word.lower())
    print word
    if word in synonymMap:
        word = synonymMap[word]
    if word in weightMap:
        for key, value in weightMap[word].iteritems():
            if key in computed:
                computed[key] += value
            else:
                computed[key] = value
sorted_computed = sorted(computed.iteritems(), key=operator.itemgetter(1))
from heapq import heappush, heappop
from collections import defaultdict
from stemmer import PorterStemmer
from pairwise import pairwise

here = lambda *x: os.path.abspath(os.path.join(os.path.dirname(__file__), *x))
stopwords_file = here('stopwords', )
index_file = here('index', )
lengths_file = here('lengths', )
top = 20
lambda_ = 0.5

query = sys.argv[1:]
inner_query = []
p = PorterStemmer()
index = {}
count_term = {}
start_time = time.time()

with open(stopwords_file, "r") as file:
    stopwords = map(lambda line: line.strip(), file.readlines())

with open(index_file, "r") as index_file:
    lines = index_file.readlines()
    for line in lines:
        entry = line.split(" ")
        documents = entry[2:]
        dictionary = defaultdict(int)
        for document, count in pairwise(documents):
def stem(word):
    p = PorterStemmer()
    return p.stem(word, 0, len(word) - 1)
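# Several snippets above construct a fresh PorterStemmer inside a loop or per call,
# which works but is wasteful. A small usage sketch, under the same assumption that
# the reference stemmer.py module is importable; stem_all is an illustrative name,
# not from the original code.
from stemmer import PorterStemmer  # assumption: local copy of the reference implementation

_SHARED_STEMMER = PorterStemmer()

def stem_all(words):
    # (word, 0, len(word) - 1) is the reference implementation's
    # "stem the slice word[0:len(word)]" calling convention
    return [_SHARED_STEMMER.stem(w.lower(), 0, len(w) - 1) for w in words]

# Example: stem_all(["Stemming", "queries", "quickly"]) -> ["stem", "queri", "quickli"]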
DATASETS = "datasets"
TRAINING = DATASETS + "/training.txt"
VALIDATION = DATASETS + "/validation.txt"
SPORTS = 2
POLITICS = 1
HASHTAGS = re.compile("\#([^\s\,\.\#\"\'\+\=\|\$\%\^\:]+)")
URLS1 = re.compile("https?\:\/\/([^\s]+)")
URLS2 = re.compile("www\.([^\s]+)")
REFS = re.compile("\@([^\s\,\.\#\"\'\+\=\|\$\%\^\:\-]+)")
KEYWORDS = re.compile("(\w+)")
WHITESPACE = re.compile(
    "[\s\.\,\'\"\[\]\{\}\;\:\/\&\=\+\-\)\(\*\&\^\%\$\`\|\?\!]+")
STEMMER = PorterStemmer()
idtable = {}


def get_id(table, key, write=True):
    if table.has_key(key):
        return table[key]
    else:
        if write:
            table[key] = len(table)
            return table[key]
        else:
            return None


def register(table, keys, write=True):
class Crawler: def __init__(self): """ Author: Nicole This method declares the list stopwords, dictionaries all_words and all_words_freq, as well as the PorterStemmer object. """ self.stopwords = [] self.p = PorterStemmer() self.all_words = {} self.all_words_freq = {} self.tfidf = {} self.vocabulary = [] self.doc_term_matrix = [[0] * 23 for n in range(809)] self.docs = {} self.visited = [] def clean_url(self, url) : """ Author: Jason This method removes the base url EX. http://lyle.smu.edu/~fmoore/schedule.htm => schedule.htm """ url = re.compile(_ROOT_).sub('', url) url = re.compile('http://lyle.smu.edu/~fmoore').sub('', url) url = re.compile('index.*').sub('', url) url = re.compile('.*.gif').sub('', url) return re.compile('\.\./').sub('', url) def fetch(self, url) : """ Author: Jason This method will fetch the contents of the page. """ r = requests.get(urlparse.urljoin(_ROOT_, self.clean_url(url))) return r.text def extract_urls(self, text) : """ Author: Jason This method will take the contents of a page and extract all of the URLs on it """ urls = [] soup = BeautifulSoup(text, 'html.parser') for atag in soup.find_all('a'): urls.append(atag.get('href')) for img in soup.find_all('img'): urls.append(img.get('src')) return urls def external_link(self, url) : """ Author: Jason This method will check if the URL is an external link outside the root domain """ if url : url = re.compile('https*://').sub('', url) if re.compile('.*lyle.smu.edu/~fmoore.*').match(url) : return False elif re.compile('www.*').match(url) : return True elif re.compile('java-source.*').match(url) : return True elif re.compile('.*smu.edu.*').match(url) : return True elif re.compile('.*.aspx').match(url) : return True elif re.compile('mailto:').match(url) : return True elif re.compile('.*.xlsx').match(url) : return False elif requests.get(_ROOT_ + url).status_code == 200 : return False elif self.jpeg_link(url) : return False else : return True else : return True def jpeg_link(self, url) : """ Author: Jason This method will check if the link is a JPEG """ return True if re.compile('.*.jpg').match(url) else False def broken_link(self, url) : """ Author: Jason This method will check if the link is broken. """ return False if requests.get(_ROOT_ + self.clean_url(url)).status_code == 200 else True def excel_link(self,url) : """ Author: Jason This method will check if the link is an excel file. """ return True if re.compile('.*.xlsx').match(url) else False def add_root_to_links(self, urls) : """ Author: Jason This method will add the root URL to all of the links for visual apperance """ new_urls = [_ROOT_ + re.compile('http://lyle.smu.edu/~fmoore/').sub('', link) for link in urls] return new_urls def remove_extra_whitespace(self, text) : """ Author: Nicole This method removes more than one white space between words. """ p = re.compile(r'\s+') return p.sub(' ', text) def remove_punctuation(self, text) : """ Author: Nicole This method uses regex to remove the punctuation in text. http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python """ exclude = set(string.punctuation) return ''.join(ch for ch in text if ch not in exclude) def load_stop_words(self, file) : """ Author: Nicole This method stores the list of stopwords from a file to the class variable list self.stopwords. 
""" self.stopwords = [line.rstrip(' \n').lower() for line in open(file)] def prepare_text(self, text) : """ Author: Nicole This method prepares the raw HTML text for it to be indexed by lowering the letters, removing the HTML tags, removing the punctuation, removing the extra white space, changing the list to ASCII from unicode, removing the stop words, and stemming each word. """ text = strip_tags(text.lower()) text = self.remove_punctuation(text) text = self.remove_extra_whitespace(text) text = [word.encode('UTF8') for word in text.split()] text = [word for word in text if word not in self.stopwords] text = self.p.stem_word(text) return text def add_root_if_not_there(self, url) : """ Author: Jason This method will add the root url to a single link if it isnt there """ url = re.compile('http://lyle.smu.edu/~fmoore/').sub('', url) return _ROOT_ + url def index(self, url, doc_words) : """ Author: Nicole This method indexes all the words in a document and keeps track of the frequency of a word in overall documents and overall occurrences. """ for key in doc_words : if key not in self.all_words: self.all_words[key] = [(url, doc_words[key])] self.all_words_freq[key] = [1, doc_words[key]] self.vocabulary.append(key) # [word][docID] = word_freq # print '['+str(self.vocabulary.index(key))+']['+str(self.docs[self.add_root_if_not_there(url)])+'] = ' + str(self.all_words[key][0][1]) self.doc_term_matrix[self.vocabulary.index(key)][self.docs[self.add_root_if_not_there(url)]] = self.all_words[key][0][1] else: self.all_words[key].append((url, doc_words[key])) self.all_words_freq[key][0] += 1 self.all_words_freq[key][1] += doc_words[key] for tup in self.all_words[key] : if tup[0] == str(url) : self.doc_term_matrix[self.vocabulary.index(key)][self.docs[self.add_root_if_not_there(url)]] = tup[1] def calTFIDF(self, word, visited) : """ Author: Nicole This method will calculate the TF-IDF for a given word. 
1 + log(number of times word appears in a document) * log(total documents/ how many documents the word appears in) """ if word in self.all_words: for i in self.all_words[word] : return (1 + log10(i[1])) * log10(len(visited)/self.all_words_freq[word][0]) else : return 0 def write_output(self, visited, external, jpeg, broken, dictionary) : """ Author: Jason and Nicole but mostly Jason except for lines 211 - 213 This method will write the output of the crawler and the 20 most common words to output.txt """ dictionary = sorted(dictionary.items(), key=lambda e: e[1][1], reverse=True)[:20] f = open('output.txt', 'w') f.write('Output of Jason and Nicole\'s web crawler.\n') f.write('Current Time: ') f.write(strftime("%Y-%m-%d %H:%M:%S", localtime())) f.write('\n\n') # Visited links f.write('Visited Links: (' + str(len(visited)) + ' total)\n') for link in visited : f.write(link + '\n') f.write('\n') # External links f.write('External Links: (' + str(len(external)) + ' total)\n') for link in external : f.write(link + '\n') f.write('\n') # JPEG links f.write('JPEG Links: (' + str(len(jpeg)) + ' total)\n') for link in jpeg : f.write(link + '\n') f.write('\n') # Broken links f.write('Broken Links: (' + str(len(broken)) + ' total)\n') for link in broken : f.write(link + '\n') f.write('\n') # Term Frequency f.write('Top 20 Most Common Words with Document Frequency\n') for i in dictionary: f.write('The term ' + i[0] + ' occurs ' + str(i[1][1]) + ' times in ' + str(i[1][0]) + ' documents.\n') f.close() f = open('term_document_frequency_matrix.txt', 'w') f.write('Term/Document Frequency Matrix for Jason and Nicole\'s web crawler.\n') f.write('Current Time: ') f.write(strftime("%Y-%m-%d %H:%M:%S", localtime())) f.write('\n\n ') # 15 spaces for key, val in self.docs.iteritems() : f.write('{0:60}'.format(key)) f.write('\n') for i in range(0,len(self.vocabulary)) : f.write('{0:15}'.format(self.vocabulary[i])) for j in range(0,23) : f.write('{}'.format(self.appears(self.doc_term_matrix[i][j])).ljust(60)) f.write('\n') f.close() def appears(self, i) : """ Author: Jason This method will return 1 if the frequency (i) is greater than 1. It is used for writing the term/document frequency matrix """ if i >= 1 : return 1 else: return 0 def clean_external_links(self, external) : """ Author: Jason This method will cremove the non links from the external links """ urls = [] for link in external : if link == None : return urls else : urls.append(link) return urls def query_engine(self, N): """ Author: Jason This method will be the main query handler. self.all_words format (var info below): [('spring'), [('url', 3), ('other_page', 4)] ] word tuples(url, frequency) """ print "#################################################################" print "################ Jason and Nicoles' Web Crawler #################" print "#################################################################" print print "Please enter a query to search the lyle.smu.edu/~fmoore domain." print "Search will display top " + str(N) + " results or all results that query is found on." 
print "Type 'quit' to exit the search engine" user_input = "" while True : user_input = raw_input("> ") if user_input == "quit" or user_input == "Quit" or user_input == "QUIT": break query = self.p.stem_word(re.sub("[^\w]", " ", user_input).split()) query = [word.lower() for word in query] for word in query : if word in self.stopwords : query.remove(word) query_vector = [self.calTFIDF(word, self.visited) for word in query] docs = {} for doc_name, ID in self.docs.iteritems() : vector = [] for word in query : if word in self.vocabulary : if self.doc_term_matrix[self.vocabulary.index(word)][self.docs[self.add_root_if_not_there(doc_name)]] >= 1 : vector.append(1) else : vector.append(0) docs[doc_name] = self.normalize_vector(vector) rankings = {} for url, doc_vec in docs.iteritems() : rankings[url] = self.calculate_cosine_similarity(doc_vec, query_vector) sorted_rankings = sorted(rankings.items(), key=operator.itemgetter(1), reverse=True) i = 0 if sorted_rankings[0][1] == 0.0 : print '%s not found in domain.\n' % user_input continue print ' Score: Document:' while i < N : if sorted_rankings[i][1] == 0.0 : break print ' {0:4f}'.format(sorted_rankings[i][1]) + ' {}'.format(sorted_rankings[i][0]) i += 1 print return def normalize_vector(self, vector) : """ Author: Jason This method will nomalize the vector to prep for calculate_cosine_similarity """ if numpy.linalg.norm(vector) == 0.0 : return [0.0 for i in vector] else : return [i / numpy.linalg.norm(vector) for i in vector] def calculate_cosine_similarity(self, doc, query) : """ Author: Jason This method will calculate the cosine similarity betwee two vectors of equal size """ if len(doc) != len(query) : return 0.0 return numpy.dot(doc,query) def crawl(self, pages_to_index) : """ Author: Jason and Nicole This is the main worker method. It will parse the urls, add the words to the index, get the next links, and continue looping through the queue until the number of pages to index is met. 
""" parser = robotparser.RobotFileParser() parser.set_url(urlparse.urljoin(_ROOT_, 'robots.txt')) parser.read() # Add _ROOT_ url to queue urlqueue = [_ROOT_] # visited, external, jpeg, and broken links visited, external, jpeg, broken = [], [], [], [] # pages indexd pages_indexed = 0 while urlqueue: # get flast element in urlqueue url = urlqueue.pop(-1) if self.clean_url(url) in visited: continue # check if we can fetch the page first if parser.can_fetch('*', urlparse.urljoin('/', url)) : # remove the / at the beginning of the string url = re.compile('^/').sub('',url) # fetch the page page = self.fetch(url) # add page to visited links visited.append(self.clean_url(url)) # get urls from page new_urls = self.extract_urls(page) for new_url in new_urls : # check if we have already visited it or are going to if new_url not in visited and new_url not in urlqueue and new_url not in jpeg and new_url not in broken and new_url not in external : if self.external_link(new_url) : external.append(new_url) elif self.jpeg_link(new_url) : jpeg.append(new_url) elif self.excel_link(new_url) : visited.append(new_url) elif self.broken_link(new_url) : broken.append(new_url) else : urlqueue.append(new_url) # docs and page id self.docs[self.add_root_if_not_there(url)] = pages_indexed # checks to see if url is parsable aka .html, .htm, .txt # if yes, then parse and index; if no, pass filename, file_extension = os.path.splitext(url) if not (file_extension == ".pdf" or file_extension == ".pptx") : pagetext = requests.get(_ROOT_ + self.clean_url(url)) pagetext = pagetext.text cleantext = self.prepare_text(pagetext) doc_words = Counter(cleantext) self.index(url, doc_words) # increment the pages indexed pages_indexed += 1 if int(pages_indexed) >= int(pages_to_index): break # end if # end while # clean the links for visual appearance visited = set(self.add_root_to_links(visited)) self.visited = visited jpeg = self.add_root_to_links(jpeg) broken = self.add_root_to_links(broken) external = self.clean_external_links(external) # write to output file self.write_output(visited, external, jpeg, broken, self.all_words_freq) # query engine with N=5 self.query_engine(5)
#!/usr/bin/env python
import sys
sys.path.append('/home/yipei/Twitter/FeatureExtraction/code/util')
from collections import defaultdict
import os.path as path
from stemmer import PorterStemmer
import re
import TwitterParser as Tparse

stemmer = PorterStemmer()
filelist = sys.argv[1]
outputdir = sys.argv[2]

for line in open(filelist):
    line = line.strip()
    fin = open(line, 'r')
    clip = path.basename(line).split(".")[0]
    print "process ", clip
    boffile = path.join(outputdir, clip + '.bof')
    fout = open(boffile, 'w')
    wordDict = defaultdict(int)
    wordDict.clear()
    # go through all tweets and count the number of each term
    while True:
def GetDataset():
    np.random.shuffle(emails)
    x_vals = []
    y_vals = []
    stemmer = PorterStemmer()
    word_mapping = GetWordDictionary(emails)
    i = 0
    text_data = []
    for i in range(0, len(emails)):
        # print "Evaluation Email %d" % (i)
        # note: time diff in mins
        email, next_email, time_diff, label = emails[i]
        # Create feature array
        features = []
        # Feature 1: Number of to
        features.append(len(email['to']))
        # Feature 2: Num words
        words = email['body'].split()
        lower_case_body = [stemmer.stem(x.lower(), 0, len(x) - 1) for x in words]
        features.append(len(words))
        # Feature 3: Number of CC
        features.append(email['cc_count'])
        # Feature 4: is reply
        if email['is_re']:
            features.append(1)
        else:
            features.append(0)
        # Feature 5: Time of Day (hour)
        date = email['date']['local_date']
        hour = date.hour
        features.append(hour)
        # Feature 6: Length of Subject Line
        subject_words = email['subject'].split()
        lower_case_subject = [stemmer.stem(x.lower(), 0, len(x) - 1) for x in subject_words]
        features.append(len(subject_words))
        # Feature 7: Day of Week
        features.append(date.weekday())
        # Feature 8: Number of question marks in Body
        features.append(email['body'].count('?'))
        # Feature 9: Number of question marks in Subject
        features.append(email['subject'].count('?'))
        # NEW FEATURES
        # Features 10-11: boolean presence of ? in body / subject
        features.append(1 if '?' in email['body'] else 0)
        features.append(1 if '?' in email['subject'] else 0)
        # Feature 12-13: "RESPONSE NEEDED"-style keywords in subject or body
        keywords = ['response', 'please', 'can', 'urgent', 'important', 'need']
        for keyword in keywords:
            stemmed_keyword = stemmer.stem(keyword, 0, len(keyword) - 1)
            features.append(1 if stemmed_keyword in lower_case_subject else 0)
            features.append(1 if stemmed_keyword in lower_case_body else 0)
        x_vals.append(features)
        y_vals.append(label)
    X = np.array(x_vals)
    Y = np.array(y_vals)
    return X, Y
import glob
import os
import re
import string

from bs4 import BeautifulSoup

from stemmer import PorterStemmer

here = lambda *x: os.path.abspath(os.path.join(os.path.dirname(__file__), *x))
data_dir = here('data', )
corpus_dir = here('corpus', )
total_files = 0
p = PorterStemmer()

if not os.path.exists(corpus_dir):
    os.makedirs(corpus_dir)

os.chdir(data_dir)
for file in glob.glob('*.sgm'):
    current_file = os.path.join(data_dir, file)
    print 'Extract files from file %s' % current_file
    soup = BeautifulSoup(open(current_file))
    for document in soup.find_all('reuters'):
        new_file = os.path.join(corpus_dir, document.get('newid'))
        with open(new_file, "wb") as extracted_file:
            read_data = document.get_text().encode('utf-8')
            clean_data = re.sub(r'/', ' / ', read_data)
            clean_data = re.sub(r'-', ' - ', clean_data)
            """ The punctuations contained in the string.punctuation are