Example #1
    def init_feature_sentences(self, total_content):
        t = Tokenizer()
        p = POSTagger()
        wnl = WordNetLemmatizer()

        sentences = t.sent_tokenize(total_content.lower())

        for sentence in sentences:
            tagged_sentence = p.nltk_tag(t.word_tokenize(sentence))

            #Initializing Feature Sentence dictionary
            feature_sentence = {}
            feature_sentence['sentence'] = sentence
            feature_sentence['tags'] = tagged_sentence
            feature_sentence['nouns'] = []
            feature_sentence['noun_phrases'] = []

            #Finding the Nouns/Noun Phrases in the tagged sentence
            for i in range(0,len(tagged_sentence)):
                (word, tag) = tagged_sentence[i]

                #Chunking
                if tag.startswith('N') and tag != 'NNP':
                    if i > 0 and len(feature_sentence['nouns']) > 0 and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1] and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1:
                        feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                    else:
                        feature_sentence['nouns'].append(wnl.lemmatize(word))

            self.feature_sentences.append(feature_sentence)
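Tokenizer and POSTagger above are project-specific wrappers, not part of NLTK. A minimal standalone sketch of the same consecutive-noun chunking idea using plain NLTK calls (the function name is illustrative only):

from nltk import pos_tag, sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

def extract_nouns_and_phrases(text):
    wnl = WordNetLemmatizer()
    results = []
    for sentence in sent_tokenize(text.lower()):
        tagged = pos_tag(word_tokenize(sentence))
        nouns, noun_phrases = [], []
        for i, (word, tag) in enumerate(tagged):
            if tag.startswith('N') and tag != 'NNP':
                # merge consecutive nouns ("picture quality") into a single lemmatized phrase
                if i > 0 and nouns and tagged[i - 1][0] == nouns[-1] \
                        and (nouns[-1] + ' ' + word) in sentence:
                    noun_phrases.append(wnl.lemmatize(nouns.pop() + ' ' + word))
                else:
                    nouns.append(wnl.lemmatize(word))
        results.append({'sentence': sentence, 'nouns': nouns, 'noun_phrases': noun_phrases})
    return results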
Example #2
def write_clean_turian_unigrams():
    """
    Extracts unigram embeddings from Socher's binary distribution. These can be used by other composers.

    There are only 50k embeddings (presumably for the most frequent tokens in the corpus). The words have not
    been processed: there are punctuation-only tokens, uppercased words and non-lemmatized words. There is no
    PoS-tag filtering either, so function words like "to", "while" and "there" are included.

    I remove punctuation, then lowercase and lemmatize each entry. Multiple entries may map to the
    same canonical form, in which case I keep the shortest original entry (ties are broken by preferring
    entries that are already lowercased). This could have been done better.
    Only vectors for the selected entries are kept. There are about 33k canonical
    forms left, many of which are not nouns/adjectives/verbs.

    We don't have a PoS tag for the canonical forms. I get around the problem by creating 3 copies of each
    canonical form, expanding "cat" to cat/N, cat/J and cat/V, which all share the same vector.
    """
    logging.info('Writing Turian unigrams to %s', turian_unigram_vectors_file)
    mat = loadmat(socher_unigram_embedding_matlab)
    words = [w[0] for w in mat['words'].ravel()]
    df = pd.DataFrame(mat['We'].T, index=words)

    lmtzr = WordNetLemmatizer()
    clean_to_dirty = defaultdict(list)  # canonical -> [non-canonical]
    dirty_to_clean = dict()  # non-canonical -> canonical
    to_keep = set()  # which non-canonical forms we will keep
    #  todo this can be done based on frequency or something

    for w in words:
        if set(w).intersection(set(string.punctuation).union(set('0123456789'))):
            # not a real word- contains digits or punctuation
            continue

        lemma = lmtzr.lemmatize(w.lower())
        clean_to_dirty[lemma].append(w)
        dirty_to_clean[w] = lemma

    # decide which of possibly many non-canonical forms with the same lemma to keep
    # prefer shorter and lowercased non-canonical forms
    for lemma, dirty_list in clean_to_dirty.items():
        if len(dirty_list) > 1:
            best_lemma = min(dirty_list, key=lambda w: (len(w), not w.islower()))
        else:
            best_lemma = dirty_list[0]
        to_keep.add(best_lemma)

    # remove non-canonical forms we don't want
    idx_to_drop = [i for i, w in enumerate(df.index) if w not in to_keep]
    ddf = df.drop(df.index[idx_to_drop])
    # canonicalize whatever is left
    ddf.index = [lmtzr.lemmatize(w.lower()) for w in ddf.index]

    # we don't know what the PoS tags of the canonical forms are, so make them all of the same tag
    # e.g. expand "cat" to cat/N, cat/J and cat/V, which all share the same vector
    new_index = ['%s/%s' % (w, pos) for pos in 'NJV' for w in ddf.index]
    new_data = np.vstack([ddf.values] * 3)
    ddf = pd.DataFrame(new_data, index=new_index)
    dv = DenseVectors(ddf, allow_lexical_overlap=True)
    dv.to_tsv(turian_unigram_vectors_file)
    logging.info('Done')
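The min() key used above prefers the shortest surviving entry and, among entries of equal length, one that is already lowercased; a small illustration with made-up values:

dirty_list = ['Cats', 'cats', 'catses']
min(dirty_list, key=lambda w: (len(w), not w.islower()))   # -> 'cats'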
	def __init__(self, text, product_name):
		self.candidate_features = []
		self.feature_sentences = []
		self.product_name = product_name.lower().split('-')[0].split('_')
		t = Tokenizer()
		sents = t.sent_tokenize(text.lower())
		p = POSTagger()
		wnl = WordNetLemmatizer()
		for sent in sents:
			tagged_sent = p.nltk_tag(t.word_tokenize(sent))
			feature_sent = {}
			feature_sent['sentence'] = sent
			feature_sent['tags'] = tagged_sent
			feature_sent['nouns'] = []
			feature_sent['noun_phrases'] = []
			for i in range(0, len(tagged_sent)):
				(word, tag) = tagged_sent[i]
				#Don't include proper nouns
				if tag.startswith('N') and tag != 'NNP':
					"""
					Consecutive nouns might form a feature phrase. E.g. "picture quality" is a phrase.
					Meaningless phrases like 'quality digital' are removed later as their frequency of occurrence is low. """
					if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1:
						feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
					else:
						feature_sent['nouns'].append(wnl.lemmatize(word))
					
			self.feature_sentences.append(feature_sent)
def lemmatize(tokens): 
	# lemmatize words. try both noun and verb lemmatizations 
	lmtzr = WordNetLemmatizer() 
	for i in range(0,len(tokens)): 
		res = lmtzr.lemmatize(tokens[i]) 
		if res == tokens[i]: 
			tokens[i] = lmtzr.lemmatize(tokens[i], 'v') 
		else: 
			tokens[i] = res 
	return tokens
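A quick usage sketch, assuming NLTK's WordNet data is installed; the noun lemma is tried first and the verb lemma is used only when the noun lemma leaves the token unchanged:

print(lemmatize(['cats', 'running', 'is']))   # expected: ['cat', 'run', 'be']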
class Lemmatizer():
	def __init__(self):
		self.lemmatizer = WordNetLemmatizer()
		self.stemmer = SnowballStemmer("english", ignore_stopwords=True)

	'''
	Lemmatizes every word in a sentence and then tokenizes it.	
		sentence: str
	'''
	def lemmatize(self, sentence):
		tokens = word_tokenize(sentence)
		lemmas = self.lemmatizeTokens(tokens)
		return " ".join(lemmas)
		
	'''
	Turns phrase tokens into lemmatized tokens, which means into some standard format
	as determined by the nltk lemmatizer. "Dogs" to "dog", "went" to "go", etc.	 
		tokens: list of str
	'''
	def lemmatizeTokens(self, tokens):
		tokens_tagged = pos_tag(tokens)
		#Get simple POS tags.
		tokens_simpleTags = [(word, map_tag('en-ptb', 'universal', tag)) 
			for word, tag in tokens_tagged]
		
		#Actually lemmatize.
		lemmas = []
		for token, tag in tokens_simpleTags:
			lemmatized = ""
			if tag == "VERB":
				lemmatized = self.lemmatizer.lemmatize(token, pos='v')
			elif tag == "ADJ":
				lemmatized = self.lemmatizer.lemmatize(token, pos='a')
			elif tag == "ADV":
				lemmatized = self.lemmatizer.lemmatize(token, pos='r')
			else:
				lemmatized = self.lemmatizer.lemmatize(token) #pos = 'n'
			lemmas.append(lemmatized.encode("utf-8"))
		return lemmas

	'''
	Reduce this word down to its most basic form by removing suffixes or common ending
	and finding the "root" or "stem" of the word.

	Example: "response," "responsive," and "responsivity" all stem from "respons," or 
	something similar.
	'''
	def stem(self, tokens):
		stemmed = []
		for token in tokens:
			stem = self.stemmer.stem(token)
			stemmed.append(stem.encode("utf-8"))
		return stemmed
def process_data(sentence):
    #Reference: http://stackoverflow.com/questions/20827741/nltk-naivebayesclassifier-training-for-sentiment-analysis
    #Reference: https://blog.cambridgecoding.com/2016/01/25/implementing-your-own-spam-filter/
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word.lower()) for word in word_tokenize(sentence)
    ]
Example #7
def tokenize(data_list):
    from nltk.tokenize import TweetTokenizer
    from nltk.corpus import stopwords

    tt = TweetTokenizer()
    stopwords = set(stopwords.words('english'))
    climate_list = []
    for d in data_list:
        d = str(d)
        data_dict = {}
        list_words = tt.tokenize(d)
        list_words = [
            w.lower() for w in list_words if isinstance(w, str) == True
        ]
        list_words = [w for w in list_words if w.isalpha()]
        filtered_words = [w for w in list_words if not w in stopwords]
        wnl = WordNetLemmatizer()
        filtered_words = [wnl.lemmatize(w, 'n') for w in filtered_words]
        filtered_words = [wnl.lemmatize(w, 'v') for w in filtered_words]
        stemmer = PorterStemmer()
        filtered_words = [stemmer.stem(w) for w in filtered_words]
        for i in filtered_words:
            key = data_dict.get(i)
            if key == None:
                data_dict[i] = 1
            else:
                data_dict[i] += 1
        climate_list.append(data_dict)
    return climate_list
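A hedged usage sketch; the exact keys depend on the NLTK stopword list and the Porter stemmer, but the shape of the result is one count dictionary per input item:

tokenize(["Climate change is affecting the oceans"])
# roughly: [{'climat': 1, 'chang': 1, 'affect': 1, 'ocean': 1}]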
Example #8
    def _process_descr_text(self):
        # Preprocessing of text data

        # Convert all the string to lower cases
        proc_text = self.descr_text.lower()

        # \S+ means anything that is not an empty space
        proc_text = re.sub('http\S*', '', proc_text)

        # \s+ means all empty space (\n, \r, \t)
        proc_text = re.sub('\s+', ' ', proc_text)
        proc_text = re.sub('[^\w\s]', '', proc_text)

        # Adding domain-based stop words to general English stop words list and ignoring these in data
        stop = stopwords.words('english') + ["festival", "event", "festiv", "day", "week", "month", "year", "much", \
                                            "feature", "celebration", "celebrate", "featuring", "featurin", "include", \
                                            "weekend", "event", "featuring", "enjoy", "fest", "cotopaxi", "questival", \
                                            "around", "best", "including", "great", "first", "come", "throughout", "area", \
                                            "festivals", "events", "fairs", "days", "celebrations", "fests", "includes", \
                                            "features", "celebrating", "areas"]

        proc_text = " ".join(word for word in proc_text.split()
                             if word not in stop)

        # Tokenizes and lemmatizes words
        proc_text = word_tokenize(proc_text)
        lemztr = WordNetLemmatizer()
        proc_text = ' '.join([lemztr.lemmatize(word) for word in proc_text])

        #self.proc_text = proc_text
        return proc_text
Example #9
class TextAnalyser(object):
    def __init__(self):
        self.threshold = 0.99
        self.__rake = Rake()
        self.__stemmer = LancasterStemmer()
        self.__lemma = WordNetLemmatizer()
        self.__stopwords = ['alt']
        pass

    def extract(self, text):
        self.__rake.extract_keywords_from_text(text.strip())
        scores = self.__rake.get_ranked_phrases_with_scores()
        keywords = self.unpack_keywords(scores)
        words = filter(lambda x: x[1] not in self.__stopwords and x[1].isalnum(), keywords)

        filtered_words = map(lambda x: x[1], filter(lambda x: x[0] > self.threshold, words))

        lemms = map(lambda x: self.__lemma.lemmatize(x), filtered_words)
        stems = map(lambda x: self.__stemmer.stem(x), lemms)

        return stems

    @staticmethod
    def unpack_keywords(keywords):
        words = []

        for k in keywords:
            for p in k[1].split(' '):
                words.append((k[0], p))

        return words
Example #10
def data_cleaning(data):

    data["essay"] = data["essay"].str.lower()

    # Tokenize
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    data["essay_token"] = data["essay"].apply(tokenizer.tokenize)

    # Remove stop words
    # stop_words = stopwords.words('english')
    # data["essay_token"] = data["essay_token"].apply(lambda x: [word for word in x if word not in stop_words])

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    data["essay_token"] = data["essay_token"].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

    # Stem
    stemmer = nltk.stem.SnowballStemmer('english')
    data["essay_token"] = data["essay_token"].apply(lambda x: [stemmer.stem(word) for word in x])
    data = data.fillna(6)
    essays = []
    for essay, score in zip(data['essay_token'], data['score']):
        essays.append((' '.join(essay), int(score)))

    return essays
Example #11
def preprocess(text_data):
    nltk.download("wordnet")
    processed_text = []
    word_lemmatizer = WordNetLemmatizer()

    url_pattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    user_pattern = "@[^\s]+"
    alpha_pattern = "[^a-zA-Z0-9]"
    sequence_pattern = r"(.)\1\1+"
    seq_replace_pattern = r"\1\1"

    for tweet in text_data:
        tweet = tweet.lower()

        # Replace all Urls with 'URL'
        tweet = re.sub(url_pattern, ' URL', tweet)
        # Replace all emojis
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])

        tweet = re.sub(user_pattern, ' USER', tweet)
        tweet = re.sub(alpha_pattern, " ", tweet)
        tweet = re.sub(sequence_pattern, seq_replace_pattern, tweet)

        tweet_words = ''
        for word in tweet.split():
            if len(word) > 1:
                word = word_lemmatizer.lemmatize(word)
                tweet_words += (word + ' ')

        processed_text.append(tweet_words)

    return processed_text
Example #12
def lemma():
    file1 = open('inputtext.txt', 'r')
    text = file1.readline()
    wnl = WordNetLemmatizer()
    lem = []
    tags = []
    data = ""
    while text != "":
        data = data + text
        line_tags = pos_tag(wordpunct_tokenize(text))
        tags = tags + line_tags
        # map each Penn Treebank tag to a WordNet POS and lemmatize the new tokens
        for word, pos in line_tags:
            wn_pos = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}.get(pos[0], 'n')
            lem.append(wnl.lemmatize(word, wn_pos))
        text = file1.readline()
    file1.close()

    # bi-gram counts
    bgram = {}
    for item in ngrams(data.split(), 2):
        if item in bgram:
            bgram[item] += 1
        else:
            bgram[item] = 1

    # top 10 bigrams
    top_bigrams = sorted(bgram, key=bgram.get, reverse=True)[:10]
    for item in top_bigrams:
        print(item, bgram[item])

    # print the lines that contain one of the top 10 bigrams
    for line in open('inputtext.txt', 'r'):
        for bigram in top_bigrams:
            if ' '.join(bigram) in line:
                print(line)
                break
Example #13
def quantify_negativity(lyrics, emolex):
    wnl = WordNetLemmatizer()

    negative_pct = {}
    stop_words = set(stopwords.words('english'))

    print('[+] Calculating lyrics negativity')
    for song in lyrics:
        tokens = [t.lower() for t in word_tokenize(lyrics[song]) \
         if t not in stop_words and t.isalpha()]
        tokens = set([wnl.lemmatize(t) for t in tokens])
        tagged = pos_tag(tokens)
        tokens = [x[0] for x in tagged if x[1] not in unwanted_pos_tags]

        len_tokens = len(tokens)
        total_tokens = len_tokens if len_tokens != 0 else 1
        negative_tokens = 0

        for token in tokens:
            for emotion in negative_emotions:
                if token.lower() in emolex[emotion]:
                    negative_tokens += 1
                    break

        negative_pct[song] = negative_tokens / total_tokens

    return negative_pct
Example #14
    def prepare_vocab(self, word_corpus, word_embedding, topk, num_vocab):

        lemmatizer = WordNetLemmatizer()
        lemmaed_count_1w = Counter()

        with open(word_corpus, 'r') as f:
            for line in f:
                word, count = line.strip().split('\t')
                lemmaed_count_1w[lemmatizer.lemmatize(word)] += int(count)
        topk_vocab = heapq.nlargest(topk,
                                    lemmaed_count_1w,
                                    key=lemmaed_count_1w.get)

        topk_vocab_vec = OrderedDict.fromkeys(topk_vocab)
        n_non_empty = 0
        with open(word_embedding, 'r') as f:
            for line in tqdm(f):
                word, *vec = line.rstrip().split(' ')
                if word in topk_vocab_vec:
                    topk_vocab_vec[word] = np.array(vec, dtype=float)
                    n_non_empty += 1
            print('Num of non empty vectors: ', n_non_empty)

        vocab_vec = np.zeros([300, num_vocab])
        vocab = []
        num = 0
        for k, v in iter(topk_vocab_vec.items()):
            if v is not None:
                vocab_vec[:, num] = v
                vocab.append(k)
                num += 1
            if num >= num_vocab:
                break
        np.save('vocab.npy', vocab)
        np.save('vocab_vec.npy', vocab_vec)
def tokenize(document):
    lemmatizer = WordNetLemmatizer()
    "Break the document into sentences"
    for sent in sent_tokenize(document):
        "Break the sentence into part of speech tagged tokens"
        for token, tag in pos_tag(wordpunct_tokenize(sent)):

            "Apply preprocessing to the token"
            token = token.lower()  # Convert to lower case
            token = token.strip()  # Strip whitespace and other punctuations
            token = token.strip('_')  # remove _ if any
            token = token.strip('*')  # remove * if any

            "If stopword, ignore."
            if token in stopwords.words('english'):
                continue

            "If punctuation, ignore."
            if all(char in string.punctuation for char in token):
                continue

            "If number, ignore."
            if token.isdigit():
                continue

            # Lemmatize the token and yield
            # Note: Lemmatization is the process of looking up a single word form
            # from the variety of morphologic affixes that can be applied to
            # indicate tense, plurality, gender, etc.
            lemma = lemmatizer.lemmatize(token)
            # all_lema.append(lemma)
            yield lemma
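A usage sketch for the generator above, assuming the usual NLTK imports (sent_tokenize, pos_tag, wordpunct_tokenize, stopwords, string) are in scope at module level:

list(tokenize("The 2 cats were running, quickly!"))
# -> ['cat', 'running', 'quickly']   (stopwords, digits and punctuation dropped; default noun lemmatization)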
Example #16
def rq3():

    path_base = "commits_"
    for filename in ["big", "small"]:
        logs = get_logs(path_base + filename)
        text = " ".join([log for log in logs])
        #text = "Life is like a box of chocolates. You never know what you're gonna get."
        raw_words = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", text.lower())
        table = str.maketrans('', '', string.punctuation)
        filtered_words = [
            word.translate(table) for word in raw_words
            if word not in stopwords.words('english') and len(word) > 1
        ]

        #snowball_stemmer = SnowballStemmer("english")

        #words_stem = [snowball_stemmer.stem(filtered_word) for filtered_word in filtered_words]

        wordnet_lematizer = WordNetLemmatizer()
        words_lema = [
            wordnet_lematizer.lemmatize(filtered_word)
            for filtered_word in filtered_words
        ]
        c = Counter()
        for word_lema in words_lema:
            c[word_lema] += 1
        words = c.most_common(100)
        output_keywords(filename, words)
def returnKeywordFromList(convertpath):
    token_dict = {}
    i=0

    #nltk.download()
    wnl = WordNetLemmatizer()
    fileName = {}
    #print file
    #print str(i)+ file
    #file_path = subdir + os.path.sep + file
    shakes = open(convertpath, 'r')
    text = shakes.read()
    lowers = "".join(map(lambda l:l.decode('unicode_escape').encode('ascii','ignore'),text))
    no_punctuation = re.sub(r'[?|$|.|!0-9()=+-\/\'\"\|]',r'',lowers)
    d = {v:True for v in no_punctuation.split()}
    for token in d.keys():
        no_punctuation = no_punctuation.replace(token, wnl.lemmatize(token))
    fileName[i] = file
    token_dict[i] = no_punctuation.replace("\n"," ").replace("\r","")
    #break

    #this can take some time
    ##print token_dict.values()
    tfidf_vect = TfidfVectorizer(stop_words =stops, ngram_range=(1, 2))
    # #
    # count_vect.stop_words = stops
    #
    X_train_counts = tfidf_vect.fit_transform(token_dict.values())
    #print tfidf_vect.get_feature_names()
    #print(sortSparseMatrix(X_train_counts.getrow(0),rev=False, only_indices=False))
    sortedMatrix = sortSparseMatrix(X_train_counts.getrow(0),rev=True, only_indices=False)[0]
    x = map(lambda (x,y):x,sortedMatrix)
    result = getKeywordAlgorithms(1,sortedMatrix)
    return map(lambda key:tfidf_vect.get_feature_names()[key],result)
Example #18
def preprocess(x):
    """Preprocesses the combined thread and author and converts
    to tfidf features"""

    # change shoe brands to 'shoebrand'
    x.replace('(?i)asics|nike|adidas|hoka|brooks|puma|new balance|oiselle|saucony', 'shoebrand', regex=True,
              inplace=True)
    # change interval descriptions to 'intervals'
    x.replace('(?i)[0-9]+x([0-9]{2,4}|[a-z]+)', 'intervals', regex=True, inplace=True)
    # change times to 'time'
    x.replace('[0-9]{0,2}((:[0-9]{2})|(\\.[0-9]+))', 'time', regex=True, inplace=True)
    # change common race distances to 'distance'
    x.replace('(?i)[0-9]+(m|yd|km|mi|k)', 'distance', regex=True, inplace=True)
    # remove special characters
    x.replace('\\W', ' ', regex=True, inplace=True)
    # remove single characters
    x.replace('\\s+[a-zA-Z]\\s+', ' ', regex=True, inplace=True)
    # remove single character from start
    x.replace('\\^[a-zA-Z]\\s+', ' ', regex=True, inplace=True)
    # remove multiple spaces
    x.replace('\\s+', ' ', regex=True, inplace=True)
    # to lowercase
    x = x.str.lower()
    # lemmatize
    stemmer = WordNetLemmatizer()
    x = x.apply(lambda text: ' '.join(stemmer.lemmatize(word) for word in text.split()))

    return x
Example #19
def feature_extractor_top_words_weights(data):
    data = data.decode('utf-8')
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort', 
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
Example #20
def feature_extractor_top_words_weights(data):
    """
     Extract features using the top words with weights method
     parameter: data (tweet)
     returns: returns features of the given data
    """
    data = data.decode('utf-8')
    # top 15 frequently-ocurring words from the tourism-related twitter corpus
    top_words = ['travel', 'vacation', 'city', 'itsmorefuninthephilippines', 'travel',
                 'boracay', 'philippine', 'view', 'day', 'beach', 'morning', 'resort', 
                 'good', 'cebu', 'island']
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    # preprocessing: tokenize, convert to lowercase and lemmatize words
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    # remove stop words and add words and their frequencies as features
    for word in words:
        if word not in stop_words:
            if word in features:
                # if word is found in the top words list, increase by 1.5 or preferred weight
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
def text2sents(text, lemmatize=False, stemmer=None):
    """
    converts a text into a list of sentences consisting of normalized words
    :param text: string to process
    :param lemmatize: if True, words will be lemmatized, otherwise stemmed
    :param stemmer: stemmer to be used; if None, PorterStemmer is used. Only applied if lemmatize==False
    :return: list of lists of words
    """
    sents = sent_tokenize(text)

    tokenizer = RegexpTokenizer(r'\w+')

    if lemmatize:
        normalizer = WordNetLemmatizer()
        tagger = PerceptronTagger()
    elif stemmer is None:
        normalizer = PorterStemmer()
    else:
        normalizer = stemmer

    sents_normalized = []

    for sent in sents:
        sent_tokenized = tokenizer.tokenize(sent)
        if lemmatize:
            sent_tagged = tagger.tag(sent_tokenized)
            sent_normalized = [normalizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in sent_tagged]
        else:
            sent_normalized = [normalizer.stem(w) for w in sent_tokenized]

        sents_normalized.append(sent_normalized)
    return sents_normalized
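text2sents relies on a get_wordnet_pos helper that is not shown here; a minimal sketch, consistent with the Penn-to-WordNet mappings used by other examples on this page, might look like:

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # map a Penn Treebank tag to the WordNet POS constant expected by lemmatize()
    return {'J': wordnet.ADJ, 'V': wordnet.VERB,
            'R': wordnet.ADV}.get(treebank_tag[0], wordnet.NOUN)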
Example #22
def lemmatizing(line_list):
    """
    Input: line_list (list of strings(sentences/documents)) - e.g. dataset.data

    Iterates over all terms in lines, lemmatize them using WordNetLemmatizer()

    Return: lemmatized_list (list of strings(terms that stemmed))
    """
    lemmatized_list = []
    lemmatizer = WordNetLemmatizer()
    for i, line in enumerate(line_list):
        # lowercase
        line = line.lower()
        # remove punctuation
        # the commented-out approach below would simply drop punctuation characters,
        # but it makes mistakes such as amazon.com => amazoncom
        # nopunct_line = ''.join([c for c in line
        #                         if re.match("[a-z\-\' \n\t]", c)])
        # this solves the problem above:
        nopunct_line = re.sub('[^A-Za-z0-9]+', ' ', line)
        # tokenize
        line_token = wt(nopunct_line)
        # lemmatize each term
        lemmatized_line = []
        for term in line_token:
            term = lemmatizer.lemmatize(term)
            lemmatized_line.append(term)
        # back to sentence as a string
        lemmatized_sentence = ' '.join(lemmatized_line)
        lemmatized_list.append(lemmatized_sentence)
    return lemmatized_list
Example #23
def preprocess(original_str):
	# stemmer
	wnl = WordNetLemmatizer()
	# pos
	original_str = unicode(original_str, errors='ignore')
	print type(original_str)
	article_tok = pos_tag(word_tokenize(original_str))
	print type(article_tok)
	print "token: "
	print article_tok

	# choose Noun
	str_noun = ''
	for word, tag in article_tok:
		if ("NN" in tag) or ("JJ" in tag):
			# print(word,":",tag)
			# print(wnl.lemmatize(word))
			try:
				stemming_word = wnl.lemmatize(word)
				print stemming_word
				if len(word) > 1:
					str_noun = str_noun + stemming_word + " "
			except UnicodeDecodeError as e:
				print "error: " + word
			# end if



	# result
	# final_doc.append(str_noun)
	# print "return_preprocess : " + str_noun

	return str_noun
def lemmstem(sentences):
    ''' This function is responsible for performing
        the lemmatization and stemming of the words
        Input: A list of trees containing the sentences.
                All words are classified by their NE type
        Output: Lemmatized/stemmed sentences
    '''
    
    lmtzr = WordNetLemmatizer()
    st = LancasterStemmer()
    
    dic = {'VB' :wordnet.VERB,
            'NN': wordnet.NOUN,
            'JJ':wordnet.ADJ,
            'RB':wordnet.ADV }
    
    for sent in sentences:
      
        lvsidx=sent.treepositions('leaves') 
       
        for pos in lvsidx:
            word=sent[pos][0]
            tag = sent[pos][1]
            rtag = tag[0:2]
            if rtag in dic:
                lemm=lmtzr.lemmatize( word, dic[rtag] )
                stem=st.stem(lemm)
                #print word, lemm, stem  # cursed line
                sent[pos]=(word, tag, stem)
            else:
                sent[pos]=(word, tag, word)
    
    return sentences
Example #25
def feature_extractor_tripadvisor_top_words_weights(data):
    data = data.decode('utf-8')

    top_file = open('scraper/top_words.txt', 'r')
    top_words = [word.replace('\n', '') for word in top_file]
    places_file = open('scraper/places.txt', 'r')

    for place in places_file:
        place = place.replace('\n', '')
        for word in place.split(' '):
            if word != '-':
                top_words.append(word)

    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
Example #26
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        # print("Inside NLTk cosnt")
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        #print("In fit")
        return self

    def inverse_transform(self, X):
        #print("In inverse")
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        #print("In transorm")
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        # Break the document into sentences
        #print("In tokenize")
        for sent in sent_tokenize(document):
            print("sent", sent)
            # Break the sentence into part of speech tagged tokens
            try:
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    # Apply preprocessing to the token
                    token = token.lower() if self.lower else token
                    token = token.strip() if self.strip else token
                    token = token.strip('_') if self.strip else token
                    token = token.strip('*') if self.strip else token

                    # If stopword, ignore token and continue
                    # if token in self.stopwords:
                    #     continue

                    # If punctuation, ignore token and continue
                    if all(char in self.punct for char in token):
                        continue

                    # Lemmatize the token and yield
                    lemma = self.lemmatize(token, tag)
                    yield lemma
            except:
                print("In token tag")

    def lemmatize(self, token, tag):
        #print("In leammarize")
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)
Example #27
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        self.stopwords = set(stopwords) if stopwords else set(
            sw.words('english'))
        self.punct = punct if punct else set(string.punctuation)
        self.lower = lower
        self.strip = strip
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return X

    def transform(self, X):
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        """
        Returns a normalized, lemmatized list of tokens from a document by
        applying segmentation (breaking into sentences), then word/punctuation
        tokenization, and finally part of speech tagging. It uses the part of
        speech tags to look up the lemma in WordNet, and returns the lowercase
        version of all the words, removing stopwords and punctuation.
        """
        # Break the document into sentences
        for sentence in sent_tokenize(document):
            # Break the sentence into part of speech tagged token
            for token, tag in pos_tag(wordpunct_tokenize(sentence)):
                # Applying preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If punctuation of stopword, ignore the token and continue
                if token in self.stopwords or all(char in self.punct
                                                  for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)

                yield lemma

    def lemmatize(self, token, tag):
        """
        Converts the Penn Treebank tag to a WordNet POS tag, then uses that
        tag to perform much more accurate WordNet lemmatization.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)
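Since transform() yields lists of tokens rather than strings, a hedged usage sketch is to pair the transformer with a vectorizer configured to accept pre-tokenized input:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

model = Pipeline([
    ('preprocess', Preprocessor()),
    ('tfidf', TfidfVectorizer(tokenizer=lambda doc: doc,
                              preprocessor=lambda doc: doc,
                              lowercase=False)),
])
X = model.fit_transform(["The cats are running.", "Dogs were barking loudly."])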
Example #28
	def process_words(self,text):	
		result = []
		lem = WordNetLemmatizer()
		word_tokens = word_tokenize(text)
		for word in word_tokens:	
			if word not in stop_words:	
				result.append(lem.lemmatize(word))
		return ','.join(result).replace(',',' ')
Example #29
def preprocess(sentence):
    """ 
    Preprocess the data by splitting it into words and linking
    the different forms of the same word
    """
    tokens = word_tokenize(sentence)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word.lower()) for word in tokens]
def get_lemmatized_text(corpus):
    import nltk
    nltk.download('wordnet')
    lemmatizer = WordNetLemmatizer()
    return [
        ' '.join([lemmatizer.lemmatize(word) for word in review.split()])
        for review in corpus
    ]
Example #31
def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas
Example #32
def preprocess(text):
    lemma = WordNetLemmatizer()
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token.isalpha()]

    tokens = [lemma.lemmatize(word.lower(), pos="v") for word in tokens]
    tokens = [lemma.lemmatize(word.lower(), pos="n") for word in tokens]
    return tokens
Example #33
def preprocess(sentence):
    lemmatizer = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    return [
        lemmatizer.lemmatize(word.lower())
        for word in tokenizer.tokenize(unicode(sentence, errors='ignore'))
        if not word.startswith('/')
    ]
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self,
                 stopwords=[],
                 punct=[],
                 lower=True,
                 strip=True,
                 lemmatize=True,
                 ignore_type=[]):
        self.lower = lower
        self.strip = strip
        self.ignore_type = ignore_type
        self.stopwords = stopwords
        self.punct = punct
        self.do_lemmatize = lemmatize
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [list(self.tokenize(doc)) for doc in X]

    def tokenize(self, document):
        for sent in sent_tokenize(document):
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                if token in self.stopwords:
                    continue

                if all(char in self.punct for char in token):
                    continue

                if self.do_lemmatize:
                    lemma = self.lemmatize(token, tag, self.ignore_type)
                    yield lemma
                else:
                    yield token

    def lemmatize(self, token, tag, ignore_type=['N']):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        # Ignore nouns by default to account for plurals
        if tag in ignore_type:
            return token
        else:
            return self.lemmatizer.lemmatize(token, tag)
Example #35
def tokenize_cleaned(document, th_tokenizer, thai_char,
                     stopwords_en, stopwords_th, keywords):
    """
    Tokenize and lemmatize tokens in document.


    :param document: Document in string
    :param th_tokenizer: Thai tokenizer function (returns a list)
    :param thai_char: re.compile pattern matching all Thai characters.
    :param stopwords_en: set() of English stop word
    :param stopwords_th: set() of Thai stop word
    :param keywords: set() of keywords.
    :return: list of tokens.
    """
    from copy import deepcopy
    from nltk import WordNetLemmatizer
    import nltk

    if './Resource/nltk_data' not in nltk.data.path:
        nltk.data.path.append('./Resource/nltk_data')
    if '../Resource/nltk_data' not in nltk.data.path:
        nltk.data.path.append('../Resource/nltk_data')

    def test_all_en_alpha(text):  # test if characters in the string are all English alphabet.
        roman_alpha = [chr(alpha) for alpha in range(65, 91)] + \
                      [chr(alpha) for alpha in range(97, 123)]
        for alpha in text:
            if alpha not in roman_alpha:
                return False
        return True

    word_stem_func = WordNetLemmatizer()  # declare English lemmatizer.

    document = deepcopy(document)
    document = document.split(' ')  # split to form a list of phrases which are separated by '\s'
    # remove English stop word.
    document = [token.lower() for token in document
                if token not in stopwords_en]
    # Lemmatize English tokens.
    document = [word_stem_func.lemmatize(token)
                if test_all_en_alpha(token) and token not in keywords  # do not lemmatize keywords.
                else token
                for token in document]

    # tokenize Thai phrase.
    tokenized = []
    for token in document:
        if thai_char.search(token):  # check if phrase is in Thai
            tokenized.extend(th_tokenizer(token))  # extend to include a list of Thai tokens
        else:
            tokenized.append(token)  # append non-Thai tokens

    # remove Thai stop word
    for token_index in reversed(range(len(tokenized))):  # iterate backward
        if tokenized[token_index] in stopwords_th:  # if token is Thai stop word
            tokenized.pop(token_index)  # remove Thai stop word from doc

    return tokenized
def Preprocessing(df, contractions):
    pd.options.mode.chained_assignment = None
    contractionsDict = {}
    for i in contractions['data']:
        contractionsDict[i[0]] = i[1]

    # remove url
    df['sentence'] = df['sentence'].str.replace('http\S+|www.\S+',
                                                '',
                                                case=False)

    # remove number
    df['sentence'] = df['sentence'].str.replace('\d+', '')

    # remove hashtags
    df['sentence'] = df['sentence'].str.replace('#(\w+)', '')

    # change all text with contraction
    for index, row in df.iterrows():
        row[1] = ' '.join([
            str(x) for x in [
                contractionsDict[t] if t in contractionsDict.keys() else t
                for t in [e.lower() for e in row[1].split()]
            ]
        ])

    # remove stopword
    stop_words = []
    for word in stopwords.words('english'):
        stop_words.append(word) if ('not' not in word
                                    and 'no' not in word) else stop_words

    # remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    for index, row in df.iterrows():
        word_tokens = tokenizer.tokenize(row[1])
        row[1] = ' '.join(
            [w for w in word_tokens if not w.lower() in stop_words])

    # using lemmetizer
    wordnet_lemmatizer = WordNetLemmatizer()
    for index, row in df.iterrows():
        row[1] = ' '.join(
            wordnet_lemmatizer.lemmatize(t) for t in row[1].split())

    # remove non-english word
    english_words = set(nltk.corpus.words.words())
    for index, row in df.iterrows():
        word_tokens = tokenizer.tokenize(row[1])
        row[1] = " ".join(w for w in word_tokens
                          if w.lower() in english_words or not w.isalpha())

    # remove non-alphabetic characters
    for index, row in df.iterrows():
        word_tokens = tokenizer.tokenize(row[1])
        row[1] = " ".join(w for w in word_tokens if w.isalpha())

    return df
Example #37
class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    #it loads a variety of corpora and models for use in tokenization.
    #By default the set of english stopwords from NLTK is used, and the WordNetLemmatizer
    #looks up data from the WordNet lexicon. Note that this takes a noticeable amount of time,
    #and should only be done on instantiation of the transformer.

    def __init__(self, stopwords=None, punct=None, lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        self.stopwords = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [list(self.tokenize(doc)) for doc in X]

        #The tokenize method breaks raw strings into sentences,
        #then breaks those sentences into words and punctuation,
        #and applies a part of speech tag. The token is then normalized:
        #made lower case, then stripped of whitespace and other types of punctuation that may be appended.
        #If the token is a stopword or if every character is punctuation, the token is ignored.
        #If it is not ignored, the part of speech is used to lemmatize the token, which is then yielded
    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                if token in self.stopwords:
                    continue
                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)
Example #38
def pre_tokenize(train, dev, test):
    from nltk.tokenize import TweetTokenizer
    from nltk.corpus import stopwords

    tt = TweetTokenizer()
    stopwords = set(stopwords.words('english'))
    x_train = []
    x_dev = []
    x_test = []
    wnl = WordNetLemmatizer()
    stemmer = PorterStemmer()
    for w in train:
        list_words = tt.tokenize(w)
        list_words = [
            w.lower() for w in list_words if isinstance(w, str) == True
        ]
        list_words = [w for w in list_words if w.isalpha()]
        filtered_words = [w for w in list_words if not w in stopwords]
        filtered_words = [wnl.lemmatize(w, 'n') for w in filtered_words]
        filtered_words = [wnl.lemmatize(w, 'v') for w in filtered_words]
        filtered_words = [stemmer.stem(w) for w in filtered_words]
        x_train.append(' '.join(filtered_words))
    for w in dev:
        list_words = tt.tokenize(w)
        list_words = [
            w.lower() for w in list_words if isinstance(w, str) == True
        ]
        list_words = [w for w in list_words if w.isalpha()]
        filtered_words = [w for w in list_words if not w in stopwords]
        filtered_words = [wnl.lemmatize(w, 'n') for w in filtered_words]
        filtered_words = [wnl.lemmatize(w, 'v') for w in filtered_words]
        filtered_words = [stemmer.stem(w) for w in filtered_words]
        x_dev.append(' '.join(filtered_words))
    for w in test:
        list_words = tt.tokenize(w)
        list_words = [
            w.lower() for w in list_words if isinstance(w, str) == True
        ]
        list_words = [w for w in list_words if w.isalpha()]
        filtered_words = [w for w in list_words if not w in stopwords]
        filtered_words = [wnl.lemmatize(w, 'n') for w in filtered_words]
        filtered_words = [wnl.lemmatize(w, 'v') for w in filtered_words]
        filtered_words = [stemmer.stem(w) for w in filtered_words]
        x_test.append(' '.join(filtered_words))
    return x_train, x_dev, x_test
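The three loops over train, dev and test are identical; a hedged refactor sketch (the helper name is illustrative):

def _clean_split(texts, tt, stop, wnl, stemmer):
    cleaned = []
    for text in texts:
        words = [w.lower() for w in tt.tokenize(text) if isinstance(w, str)]
        words = [w for w in words if w.isalpha() and w not in stop]
        # noun lemma, then verb lemma, then Porter stem -- same order as above
        words = [stemmer.stem(wnl.lemmatize(wnl.lemmatize(w, 'n'), 'v')) for w in words]
        cleaned.append(' '.join(words))
    return cleaned

# e.g. x_train = _clean_split(train, tt, stopwords, wnl, stemmer), and likewise for dev and test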
Example #39
class WordComplexityLexicon:
    def __init__(self, lexicon):
        word_ratings = {}
        for line in open(lexicon):
            tokens = [t.strip() for t in line.strip().split('\t')]
            word_ratings[tokens[0].lower()] = float(tokens[1])
        self.word_ratings = word_ratings
        self.lemmatizer = WordNetLemmatizer()
        self.lancaster_stemmer = LancasterStemmer(strip_prefix_flag=True)
        self.snowball_stemmer = SnowballStemmer("english")

    def get_feature(self, words):
        phrase = max(words, key=len)

        if phrase in self.word_ratings:
            return [self.word_ratings[phrase], 1.0]
        else:
            ratings = []
            lemman = self.lemmatizer.lemmatize(phrase, pos='n')
            lemmav = self.lemmatizer.lemmatize(phrase, pos='v')
            lemmaa = self.lemmatizer.lemmatize(phrase, pos='a')
            lemmar = self.lemmatizer.lemmatize(phrase, pos='r')
            stem_lan = self.lancaster_stemmer.stem(phrase)
            try:
                stem_snow = self.snowball_stemmer.stem(phrase)
            except TypeError:
                stem_snow = ""

            if lemman in self.word_ratings:
                ratings.append(self.word_ratings[lemman])
            elif lemmav in self.word_ratings:
                ratings.append(self.word_ratings[lemmav])
            elif lemmaa in self.word_ratings:
                ratings.append(self.word_ratings[lemmaa])
            elif lemmar in self.word_ratings:
                ratings.append(self.word_ratings[lemmar])
            elif stem_snow in self.word_ratings:
                ratings.append(self.word_ratings[stem_snow])
            elif stem_lan in self.word_ratings and abs(len(stem_lan) - len(phrase)) <= 2:
                ratings.append(self.word_ratings[stem_lan])

            if len(ratings) > 0:
                return [max(ratings)*1.0, 1.0]

        return [0.0, 0.0]
Example #40
def process(comment):
    lemma = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(comment)
    filtered_sentence = ""
    for w in tokens:
        if not w in stop_words:
            filtered_sentence = filtered_sentence + lemma.lemmatize(w) + " "
    return filtered_sentence
def words_and_types(text):
    tokens = [word.lower() for word in word_tokenize(text) if word.isalpha()]
    wordtypes_ordered = nltk.pos_tag(tokens, tagset='universal')
    wordtypes = dict(wordtypes_ordered)
    lemma = WordNetLemmatizer()
    tokens = [lemma.lemmatize(word, pos = wordnet_tag(wordtypes[word])) for word in tokens]
    wordtypes = dict([(tokens[i], wordtypes_ordered[i][1]) for i in range(len(tokens))])
    sys.stdout.flush()
    return tokens, wordtypes
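words_and_types assumes a wordnet_tag helper that converts universal-tagset labels into WordNet POS constants; a minimal sketch might be:

from nltk.corpus import wordnet

def wordnet_tag(universal_tag):
    # 'VERB'/'ADJ'/'ADV' map to the corresponding WordNet POS; everything else defaults to noun
    return {'VERB': wordnet.VERB, 'ADJ': wordnet.ADJ,
            'ADV': wordnet.ADV}.get(universal_tag, wordnet.NOUN)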
Example #42
def word_extractor2(text):
	wordlemmatizer = WordNetLemmatizer()
	text = re.sub(r'([a-z])\1+', r'\1\1',text)#substitute multiple letter by two
	words = ""
	wordtokens = [ wordlemmatizer.lemmatize(word.lower()) \
	for word in word_tokenize(text.decode('utf-8', 'ignore')) ]
	for word in wordtokens:
		words+=" "+word
	return words
Example #43
def Check(mArray):
  
  # what am I checking?
  item = mArray[1]
  lmtzr = WordNetLemmatizer()
  item = lmtzr.lemmatize(item)
  
  # converts to a string
  return ''.join(item)
Example #44
class Preprocessor(object):
    def __init__(self):
        self.tokenizer = RegexpTokenizer(r'\w+')
        self.stopwords_eng = stopwords.words('english')
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, doc):
        return [
            self.lemmatizer.lemmatize(t) for t in self.tokenizer.tokenize(doc)
        ]

    def process(self, text):
        tokens = self.tokenizer.tokenize(text.lower())
        tokens_processed = []
        for t in tokens:
            if t in self.stopwords_eng: continue
            tokens_processed.append(self.lemmatizer.lemmatize(t))
        return tokens_processed
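Because __call__ is defined, an instance can be passed directly to scikit-learn's TfidfVectorizer as its tokenizer; a hedged usage sketch:

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=Preprocessor(), lowercase=False)
X = vectorizer.fit_transform(["The cats are running", "Dogs were barking"])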
Example #45
def lemmatize(tokens_list):
    """
        Uses WordNet lemmatizer to lemmatize
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_list = []
    for i in tokens_list:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(i, get_pos(i)))
    return lemmatized_list
Example #46
def Check(mArray):

  #what am I checking?
  #Taking the 2nd item in the array since popopen puts the file path as the first item.
  item = mArray[1]
  lmtzr = WordNetLemmatizer()
  item = lmtzr.lemmatize(item, get_wordnet_pos(item))
    
  #converts to a string
  return ''.join(item)
def add_lemmatizer():
    in_fp = open(word_topic_file)
    out_fp = open(word_topic_lexeme_file,  'w')
    wnl = WordNetLemmatizer()
    ###
    line = ''
    line_num = 0
    while 1 and line_num < max_line_num:
        line = in_fp.readline()
        line = line.strip()
        line_words = line.split(' ')
        line_write = ''
        for words in line_words:
            word_topic = words.split(':')
            word_id = word_topic[0]
            topic_id = word_topic[1]
            line_write += word_id
            line_write += ':'
            line_write += topic_id
            line_write += ':'
            ##
            if id_word_dict.has_key(word_id):
                word = id_word_dict[word_id]
                if word_lexeme_id_dict.has_key(word):
                    line_write += word_lexeme_id_dict[word]
                    line_write += ' '
                else:
                    word_list = []
                    word_list.append(word)
                    pos = pt(word_list)
                    tag = pos[0][1]
                    lexeme = wnl.lemmatize(word,  penn_to_wn(tag))
                    #print ': ', word,  lexeme
                    if word_id_dict.has_key(lexeme):
                        lexeme_id = word_id_dict[lexeme]
                        word_lexeme_id_dict[word] = lexeme_id
                        line_write += lexeme_id
                        line_write += ' '
                    else:
                        word_lexeme_id_dict[word] = word_id
                        line_write += word_id
                        line_write += ' '
                
            ##
        line_write = line_write.strip()
        out_fp.write(line_write)
        if line_num < max_line_num -1:
            out_fp.write('\n')
        line_num += 1
        if line_num%1000 ==0:
            print 'line: ', line_num
    ###
    in_fp.close()
    out_fp.close()
class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        #self.stopwords  = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X):
        return [
            list(self.tokenize(doc)) for doc in X
        ]

    def tokenize(self, document):
        # Break the document into sentences
        for sent in sent_tokenize(document):
            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # Apply preprocessing to the token
                token = token.lower() if self.lower else token
                token = token.strip() if self.strip else token
                token = token.strip('_') if self.strip else token
                token = token.strip('*') if self.strip else token

                # If stopword, ignore token and continue
                # if token in self.stopwords:
                #     continue

                # If punctuation, ignore token and continue
                if all(char in self.punct for char in token):
                    continue

                # Lemmatize the token and yield
                lemma = self.lemmatize(token, tag)
                yield lemma

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)
 def check_whole_collocation(self, sentence):
     """
     Generate part1 + part2 possible variants
     :param sentence: sentence in which to look for the collocation
     :return: return collocation -- (col1, col2) or False
     """
     collocate_set = set()
     lmtzr = WordNetLemmatizer()
     lemmas1 = lmtzr.lemmatize(self.first_col)
     lemmas2 = lmtzr.lemmatize(self.second_col)
     try:
         for lem1 in self.lemma_dictionary[lemmas1]:
             for lem2 in self.lemma_dictionary[lemmas2]:
                 collocate_set.add((lem1, lem2))
     except:
         collocate_set.add((lemmas1, lemmas2))
     for col in collocate_set:
         my_col = col[0] + " " + col[1]
         if my_col in sentence:
             return col
     return False
Example #50
def wn_lemmatize(lemma):
    """
    Auxiliary function for pos_lemmatizing (below)

    Lemmatize the supplied (word, pos) pair using
    nltk.stem.WordNetLemmatizer. If the tag corresponds to a
    WordNet tag, then we convert to that one and use it, else we
    just use the string for lemmatizing.
    """        
    string, tag = lemma
    string = string.lower()
    tag = tag.lower()
    wnl = WordNetLemmatizer()
    if tag.startswith('v'):    tag = 'v'
    elif tag.startswith('n'):  tag = 'n'
    elif tag.startswith('j'):  tag = 'a'
    elif tag.startswith('rb'): tag = 'r'
    if tag in ('a', 'n', 'r', 'v'):        
        return wnl.lemmatize(string, tag)
    else:
        return wnl.lemmatize(string) 
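A quick usage sketch, assuming the WordNet data is available:

wn_lemmatize(('running', 'VBG'))   # -> 'run'   (VBG maps to the WordNet verb POS)
wn_lemmatize(('cats', 'NNS'))      # -> 'cat'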
Example #51
def review_to_words(raw_review, need_to_lemmatize=False):
    # Function to convert a raw review to a string of words
    # optional lemmatization
    #
    meaningful_words = review_to_wordlist(raw_review)

    if need_to_lemmatize:
        wnl = WordNetLemmatizer()
        meaningful_words = [wnl.lemmatize(w) for w in meaningful_words]

    # 6. Join the words back into one string separated by space
    return " ".join(meaningful_words)
Example #52
def feature_extractor(data):
    data = data.decode('utf-8')
    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    for word in words:
        if word not in stop_words:
            if word in features:
                features[word] += 1
            else:
                features[word] = 1

    return features
Example #53
def get_words(document):
    '''
    Return a list of unique words in document
    '''
    regex1 = re.compile('\W')          # match non-alphanumeric
    regex2 = re.compile('&(#)*(\w)*;')  # match html entities
    regex3 = re.compile('( ){2,}')      # match more than 2 spaces
    lemmatizer = WordNetLemmatizer()
    tokenizer  = WhitespaceTokenizer()
    # lowercase document, remove punctuation, and html entities
    document   = regex3.sub(' ', regex2.sub(' ', regex1.sub(' ', document.lower())))
    words = [
             lemmatizer.lemmatize(word)
             for word in tokenizer.tokenize(document)
             if word not in STOPWORDS and len(word) > 2
            ]
    return FreqDist(words)
Example #54
def feature_extractor_tripadvisor_top_words_weights(data):
    """
     Extract features using the top words with weights 
     method using words from TripAdvisor
     parameter: data (tweet)
     returns: returns features of the given data
    """
    data = data.decode('utf-8')

    # retrieve file of top 100 frequently-occurring words from TripAdvisor comments
    top_file = open('classifier/top_words.txt', 'r')
    top_words = [word.replace('\n', '') for word in top_file]
    # retrieve file of 100 places from TripAdvisor
    places_file = open('classifier/places.txt', 'r')

    # clean places file
    for place in places_file:
        place = place.replace('\n', '')
        for word in place.split(' '):
            if word != '-':
                top_words.append(word)

    features = {}
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    # preprocessing: tokenize, convert to lowercase and lemmatize words
    words = [lemmatizer.lemmatize(word.lower()) for word in word_tokenize(data)]

    # remove stop words and add words and their frequencies as features
    # if word is found in the top words list, increase by 1.5 or preferred weight
    for word in words:
        if word not in stop_words:
            if word in features:
                if word in top_words:
                    features[word] += 1.5
                else:
                    features[word] += 1
            else:
                if word in top_words:
                    features[word] = 1.5
                else:
                    features[word] = 1

    return features
Example #55
    def __word_cleaner(self, sentence):
        """ Removes the unwanted words in the sentence. """
        features = {}
        words = {}
        lematizer = WordNetLemmatizer()

        # get individual words from text
        words = [lematizer.lemmatize(word.lower()) for word in \
                 word_tokenize(sentence)]
        final_words = []

        for word in words:
            word = word.encode('utf-8', 'ignore')
            if len(word) > 1:
                # check if word in not a stop word
                if word not in stopwords.stop_words:
                    final_words.append(word)
        return ' '.join(final_words)
Example #56
 def _lemmatize_words(text):
     """Lemmatize all words in the text."""
     lemmatizer = WordNetLemmatizer()
     lemmatizations = {}
     tokens = text.split()
     for word in tokens:
         if word not in lemmatizations:
             lemmatizations[word] = lemmatizer.lemmatize(word)
     for i in xrange(5):  # Need to repeat several times to be safe
         tokens = text.split()
         for j in xrange(len(tokens)):
             try:
                 tokens[j] = lemmatizations[tokens[j]]
             except KeyError:
                 # During last pass, words were turned into their lemmas, which don't
                 # have entries in lemmatizations
                 pass
     text = ' '.join(tokens)
     return text
Example #57
def feature_extractor(d):
    features = {}
    words = {}
    lematizer = WordNetLemmatizer()

    # get individual words from text
    words = [lematizer.lemmatize(word.lower()) for word in word_tokenize(d)]

    for word in words:
        word = word.encode('utf-8', 'ignore')
        if len(word) > 1:
            # check if word in not a stop word
            if word not in stopwords.stop_words:
                # check if the word is not a url or @person
                if not re.match('http://.*|@.*', word):
                    if word in features:
                        features[word] += 1
                    else:
                        features[word] = 1
    return features
Example #58
def word_extractor2(text, sw):
	wordlemmatizer = WordNetLemmatizer()
	# Get the English stop-word list
	commonwords = stopwords.words('english')
	text = re.sub(r'([a-z])\1+', r'\1\1', text)
	words = ""
	# Lower-case and lemmatize each token
	wordtokens = [wordlemmatizer.lemmatize(word.lower()) \
		     for word in word_tokenize(text.decode('utf-8', 'ignore'))]
	
	# Drop tokens that are in the stop-word set when sw == True
	if sw == True:
		for word in wordtokens:
			if word not in commonwords:
				words += " " + word
	else:
		for word in wordtokens:
			words += " " + word	

	return words
Example #59
def tokens(document, lowercase=True, tokenize='no_digits', 
           stopwords=False, stemming=None, lemmatize=False):
    """Tokenize a raw string based on passed tokenization options."""
    raw_doc = unicode(document, errors='ignore')

    # adjust case
    if lowercase:
        raw_doc = raw_doc.lower()

    # tokenize according to specifications
    if tokenize == 'symbols':
        tokenizer = RegexpTokenizer(r'[\'\w\-]+')
    if tokenize == 'no_symbols':
        tokenizer = RegexpTokenizer(r'\w+')
    if tokenize == 'no_digits':
        tokenizer = RegexpTokenizer(r'[A-Za-z]+')
    tokens = tokenizer.tokenize(raw_doc)

    if not stopwords:
        stop = sw.words('english')
        tokens = [word for word in tokens if word not in stop]

    if stemming and lemmatize:
        print ('Error: can only choose one of stemming or lemmatize. '
               'Choosing stemming')
        lemmatize = False

    if stemming:
        if stemming == 'porter':
            stemmer = PorterStemmer()
        if stemming == 'lancaster':
            stemmer = LancasterStemmer()
        if stemming == 'snowball':
            stemmer = SnowballStemmer('english')
        tokens = [stemmer.stem(word) for word in tokens]

    if lemmatize:
        wnl = WordNetLemmatizer()
        tokens = [wnl.lemmatize(word) for word in tokens]

    return tokens
Example #60
def tokenize(article):
    '''
    INPUT string
    OUTPUT list

    This is a tokenizer to replace the default tokenizer in TfidfVectorizer
    '''
    stop = stop_words()
    tokens = [word.lower() for word in word_tokenize(article)]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    # now remove stop words
    tokens = [word for word in tokens if word not in stop]

    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    return tokens
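As the docstring says, this is meant as a drop-in tokenizer for TfidfVectorizer; a hedged usage sketch (stop_words() is assumed to return the stop-word collection used above, and the document list is illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=tokenize)
tfidf_matrix = vectorizer.fit_transform(articles)   # articles: a list of raw article strings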