Example #1
 def stemming_phrases(self, phrase):
     porter_stemmer = PorterStemmer()
     tokens = phrase.split(" ")
     stemmed_words = [porter_stemmer.stem(word) for word in tokens]
     return " ".join(stemmed_words)
Example #2
def api():
    content = request.json
    text = content["message"]
    # NOTE: downloading NLTK data on every request is wasteful; normally done once at startup.
    nltk.download('punkt')
    nltk.download('stopwords')
    related_news = relatednews(text)
    # Load the fitted vectorizer and classifier; context managers close the files.
    with open('NBVocab.pkl', 'rb') as NBVocab:
        cv = joblib.load(NBVocab)
    with open('model.pkl', 'rb') as model:
        clf = joblib.load(model)
    ps = PorterStemmer()
    sw = set(stopwords.words('english'))
    sw.remove('not')
    sw.remove('no')
    sw.add('\n')
    text = text.lower()
    # '[A-Za-z]+' rather than '[A-z]+': the latter also matches '[', '\\', ']', '^', '_' and '`'
    tokenizer = RegexpTokenizer('[A-Za-z]+')
    word_list = tokenizer.tokenize(text)
    clean_list = [w for w in word_list if w not in sw]
    stemmed_list = [ps.stem(w) for w in clean_list]
    clean_text = ' '.join(stemmed_list)
    X_vec = cv.transform([clean_text])
    pred = clf.predict(X_vec)
    pred = pred[0]
    return jsonify({"prediction": pred, "related_news": related_news})
Example #3
def removepunct_tokenize_stem(text):
    text = "".join([ch for ch in text
                    if ch not in string.punctuation])  #Remove punctuation
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    final = [stemmer.stem(item) for item in tokens]
    return final
Example #4
def doStemming(tokens):
    ps = PorterStemmer()
    stemmed_tokens = []
    for w in tokens:
        stemmed_tokens.append(ps.stem(w))

    return stemmed_tokens
def generate_frequencies(labeled_data,  filter_threshold=0.03):
    stemmer = PorterStemmer()
    stop_words = stopwords.words('english')
    categories = dict()  # dict(category_name, {num_docs : int, counts : Counter(words)})
    # word_tokenize = lambda x: RegexpTokenizer(r'\w+').tokenize(x)

    for doc in labeled_data:
        category = doc["Category"].lower()  # some of the labels are inconsistent in case
        # if category == 'uninformative':
        #    continue
        if category not in categories.keys():
            categories[category] = {'num_docs': 1, 'counts': Counter()}
        else:
            categories[category]['num_docs'] += 1

        # tokenize with word_tokenize; stopwords are removed below
        # non-word tokens such as '?' and '`' are left in the input
        message = doc["message"].lower().strip()
        message = word_tokenize(message)

        segmented_message = []
        for wd in message:
            segmented_message.append(wd)
            segments = wordsegment.segment(wd)
            if len(segments) > 1:
                segmented_message.extend(segments)

        # keep tokens that are not stopwords and contain at least one alphanumeric character
        processed_message = [stemmer.stem(wd) for wd in segmented_message
                             if wd not in stop_words and
                             any(ch.isalnum() for ch in wd)]

        for wd in processed_message:
            categories[category]['counts'][wd] += 1

    term_freqs = deepcopy(categories)
    doc_freqs = Counter()

    for cat in categories:
        category = categories[cat]
        for wd in category['counts']:

            # calculate term frequency % (within a single category)
            # Note: can also do number of times word appears across all categories
            count = category['counts'][wd]
            freq = count / category['num_docs']
            if freq < filter_threshold:
                del term_freqs[cat]['counts'][wd]
            # else:
                # print(cat, " : ('", wd, "', ", freq, ")", sep='')

            # Increase document frequency (here doc refers to category)
            # each word should appear only once per category,
            # so this counts number of categories a word appears in

            doc_freqs[wd] += 1

    return term_freqs, doc_freqs
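A minimal usage sketch for generate_frequencies, assuming labeled_data is a list of dicts with "Category" and "message" keys as the comments above describe; the sample records are hypothetical, and the wordsegment.load() call reflects the assumption that a recent wordsegment release is in use (those require loading the unigram data once before segment()):

# Hypothetical sample records; real input comes from the labeled dataset.
wordsegment.load()  # recent wordsegment versions need this once before segment()
sample_data = [
    {"Category": "Request", "message": "Need water and food in the downtown area"},
    {"Category": "request", "message": "No water since yesterday, please help"},
    {"Category": "Report", "message": "Roads are flooded near the old bridge"},
]
term_freqs, doc_freqs = generate_frequencies(sample_data, filter_threshold=0.03)
for cat, info in term_freqs.items():
    print(cat, info["num_docs"], info["counts"].most_common(5))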
Example #6
def tokenizer_porter(text):
    """
    Split the comment text into individual words and return the Porter stem of each.
    :param text: raw comment string
    :return: list of stemmed tokens
    """
    porter = PorterStemmer()
    return [porter.stem(word) for word in text.split()]
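A quick usage sketch for tokenizer_porter; the sample sentence is arbitrary, and the same callable can also be handed to scikit-learn as TfidfVectorizer(tokenizer=tokenizer_porter):

print(tokenizer_porter("runners like running and thus they run"))
# ['runner', 'like', 'run', 'and', 'thu', 'they', 'run']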
Example #7
def doStemming(tokens):
    ps = PorterStemmer()
    stemmed_tokens = []
    for w in tokens:
        stemmed_tokens.append(ps.stem(w))

    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAfter Stemming = ",
          stemmed_tokens)
    return stemmed_tokens
Example #8
def stemmer(tokens):
	'''
	Simple stemming loop for general use throughout project. Will stem all tokens in a list.
	'''
	ps = PorterStemmer()
	stemmed = list()
	for t in tokens:
		stemmed.append(ps.stem(t))
	return stemmed
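A short usage sketch, for example on NLTK's word_tokenize output; note that Porter stems are not always dictionary words ("studies" becomes "studi"):

from nltk.tokenize import word_tokenize

tokens = word_tokenize("She studies the data and writes reports")
print(stemmer(tokens))
# roughly: ['she', 'studi', 'the', 'data', 'and', 'write', 'report']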
Example #9
 def get_tokens(self, text, n=-1):
     article = TextSplitter().text_splitter(text)
     stemmer = PorterStemmer()
     stems = FreqDist()
     for word in article:
         lower_word = word.lower()
         # filter the stop words
         if lower_word not in stopwords.words(DOC_LANG):
             stems[stemmer.stem(lower_word)] += 1
     # top n most frequent items
     if n == -1:
         return stems.most_common()
     return stems.most_common(n)
Example #10
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')

    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            matrix = pickle.loads(f.read())
            return matrix

    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    stop_words = set(stopwords.words('english'))  # build the stopword set once, not per token

    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text = tweet.text

        tokens = tokenizer.tokenize(text)
        text_proc = []
        for token in tokens:
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in stop_words:
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # elif token.startswith(("#", "$")):
            #     continue

            token = token.translate({ord(k): "" for k in string.punctuation})
            token = stemmer.stem(token)

            token = token.strip()
            if token == "":
                continue

            text_proc.append(token)

        texts.append(text_proc)

    vectorizer = TfidfVectorizer(analyzer="word",
                                 tokenizer=lambda x: x,
                                 lowercase=False)
    m = vectorizer.fit_transform(texts)

    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))

    return m
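A usage sketch for create_matrix; the Tweet namedtuple is a hypothetical stand-in for whatever tweet object the caller actually uses (only a .text attribute is read), and the data/<name>/ directory is assumed to exist so the pickle cache can be written:

from collections import namedtuple

Tweet = namedtuple("Tweet", ["text"])  # hypothetical stand-in; only .text is accessed

tweets = [
    Tweet("Oscar Pistorius trial resumes today http://t.co/abc123 #news"),
    Tweet("watching the #pistorius coverage again this evening"),
]
matrix = create_matrix(tweets, name="oscar pistorius")
print(matrix.shape)  # (number of tweets, vocabulary size)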
Example #11
def textprocessing(text):
    text = str(text)
    stemmer = PorterStemmer()
    # str.replace returns a new string, so the result must be reassigned
    text = text.replace('`', "")
    text = text.replace("\"", "")
    re_sp = re.sub(r'\s*(?:([^a-zA-Z0-9._\s "])|\b(?:[a-z])\b)', " ",
                   text.lower())
    text = re.sub("[!@#$%\n^'*)\\(-=]", " ", re_sp)
    no_char = ' '.join([w for w in text.split() if len(w) > 3]).strip()
    filtered_sp = [
        w for w in no_char.split(" ") if w not in stopwords.words('english')
    ]
    stemmed_sp = [stemmer.stem(item) for item in filtered_sp]
    return ' '.join(stemmed_sp)
Example #12
def pipeline_csv(headlines):
    headlines['headline'] = headlines['headline'].apply(nltk.word_tokenize)
    stemmer = PorterStemmer()
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [stemmer.stem(y) for y in x])
    lemmatizer = nltk.WordNetLemmatizer()
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [lemmatizer.lemmatize(y) for y in x])
    stopwords = nltk.corpus.stopwords.words('english')
    stemmed_stops = [stemmer.stem(t) for t in stopwords]
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [stemmer.stem(y) for y in x if y not in stemmed_stops])
    headlines['headline'] = headlines['headline'].apply(
        lambda x: [e for e in x if len(e) >= 3])
    headlines['headline'] = headlines['headline'].str.join(" ")
    return headlines
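A usage sketch for pipeline_csv, assuming headlines is a pandas DataFrame with a 'headline' column and that NLTK's punkt, stopwords and WordNet data have been downloaded; the sample rows are made up:

import pandas as pd

headlines = pd.DataFrame({"headline": [
    "Stocks rally after the latest earnings reports",
    "New studies question the benefits of daily vitamins",
]})
print(pipeline_csv(headlines)["headline"].tolist())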
Example #13
def stemmed_words(doc):
    '''
    This function is normally passed as the sklearn vectorizer's analyzer
    so that tokenization and stemming are performed when the vectorizer
    processes each document.

    Inputs:
    doc: the untokenized text body of a document

    Returns:
    A generator over the stemmed tokens of the document

    E.g. vectorizer = CountVectorizer(lowercase=True, analyzer=stemmed_words)
    '''
    stemmer = PorterStemmer()
    analyzer = TfidfVectorizer().build_analyzer()
    return (stemmer.stem(w) for w in analyzer(doc))
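A runnable sketch of the pattern the docstring describes, wiring stemmed_words into a vectorizer (get_feature_names_out needs scikit-learn 1.0+; the sample documents are arbitrary):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["The runners were running quickly", "A runner runs every morning"]
vectorizer = TfidfVectorizer(analyzer=stemmed_words)
X = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())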
Example #14
 def __init__(self, corpus, expanded_urls):
     self.tokenizer = TweetTokenizer()
     self.stemmer = PorterStemmer()
     self.stopwords = stopwords.words('english')
     self.corpus = corpus
     self.expanded_urls = expanded_urls
     self.re_url = r'http\S+'
     self.punctuation = string.punctuation
     self.stanford_pos_pwd = '/Users/mquezada/stanford-postagger-full-2015-12-09/'
     self.stanford_pos = StanfordPOSTagger(
         self.stanford_pos_pwd + 'models/english-left3words-distsim.tagger',
         self.stanford_pos_pwd + 'stanford-postagger.jar')
     self.tag_vocab = defaultdict(Counter)
     self.tag_token = dict()
     self.vocab = defaultdict(set)
     self.tags = Counter()
Example #15
def clean_text(text):
    x = re.compile('<.*?>')
    text = re.sub(x, '', text)

    stop_words = set(stopwords.words('english'))  # obtain the stop words
    good_words = []  # collect the tokens worth keeping
    tokenizer = RegexpTokenizer(r"[\w']+")  # raw string avoids the invalid-escape warning
    words = tokenizer.tokenize(text)  # tokenize the text
    for word in words:
        # keep alphabetic words whose lowercase form is not a stop word
        if word.lower() not in stop_words and word.isalpha():
            word = PorterStemmer().stem(word)  # stem the word
            good_words.append(word.lower())  # store the token in lower case

    return good_words
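A quick call of clean_text on an HTML-flavoured string, to show what survives the tag stripping, stopword filtering and stemming (output stems shown approximately):

sample = "<p>The children were <b>playing</b> in the gardens, 3 times a day!</p>"
print(clean_text(sample))
# roughly: ['children', 'play', 'garden', 'time', 'day']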
Example #16
 def __init__(self, data_loader):
     self.data = data_loader
     self.tokenizer = TweetTokenizer()
     self.stemmer = PorterStemmer()
     self.stopwords = stopwords.words('english')
     self.re_url = r'http\S+'
     self.punctuation = string.punctuation
     self.vocab = defaultdict(set)
Example #17
def extract_keywords(sentence):
    sentence = sentence.lower()
    not_stopw = ["no", "nor", "not", "over", "under", "again", "further",
                        "but", "against", "too", "very"]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    print(stopw)
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    sentence = sentence.replace('\n', '')
    sentence = sentence.replace("n't", " not")
    sentence = clean_string(sentence)
    sentence = pattern.sub('', sentence)
    stemmer = Stemmer()
    s = [stemmer.stem(w) for w in sentence.split()]
    b = zip(*[s[i:] for i in [0, 1]])
    b = [bigram[0] + " " + bigram[1] for bigram in b]
    return s + b
def process_text(text):
    # Lowercase
    text = text.lower()

    # Remove URLs (this pattern only strips a URL at the very start of the text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', "", text)

    # Extract alphanumeric tokens
    tokens = re.findall(r'\w+', text)

    # Remove Stopwords
    list_stopwords = stopwords.words("portuguese")
    tokens = [word for word in tokens if word not in list_stopwords]

    # Stemming (note: PorterStemmer applies English rules, even though the
    # stopword list above is Portuguese)
    snow_stemmer = PorterStemmer()
    tokens = [snow_stemmer.stem(word) for word in tokens]

    return " ".join(tokens)
Example #20
def getCleanedReview(review):
    review = review.replace('<br /><br />', " ")

    # Tokenization of text
    tokenizer = RegexpTokenizer(r'\w+')
    wordsList = tokenizer.tokenize(review)
    wordsList = [word.lower() for word in wordsList]

    # Removing stopwords
    sw = stopwords.words('english')
    sw = set(sw)
    wordsList = [word for word in wordsList if word not in sw]

    # Text stemming
    ps = PorterStemmer()
    wordsList = [ps.stem(word) for word in wordsList]
    # print(wordsList)

    # Return clean review
    cleaned_review = " ".join(wordsList)

    return cleaned_review
Example #21
 def __get_stemmer(self, stemmer, lang):
     """
     stemmer (str): method for stemming, can be either "snowball" or "porter"
     """
     lang_dict = {"da": "danish", "en": "english"}
     if lang in lang_dict:
         lang = lang_dict[lang]
     else:
         raise ValueError(f"language {lang} not in language dict for stemmer")
     if stemmer == "porter":
         ps = PorterStemmer()
         self.stemmer = ps.stem
     elif stemmer == "snowball":
         ss = SnowballStemmer(lang)
         self.stemmer = ss.stem
     elif not callable(self.stemmer):
         raise TypeError("stemmer should be 'porter' or 'snowball' or a "
                         f"callable, not a type: {type(self.stemmer)}")
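The method above belongs to a larger class that is not shown; a minimal standalone sketch of the same stemmer-selection idea, with hypothetical names (get_stem_function is not part of the original class):

from nltk.stem import PorterStemmer, SnowballStemmer

def get_stem_function(method="porter", lang="english"):
    # Hypothetical helper mirroring __get_stemmer: return a stemming callable by name.
    if method == "porter":
        return PorterStemmer().stem
    if method == "snowball":
        return SnowballStemmer(lang).stem
    raise TypeError(f"stemmer should be 'porter', 'snowball' or a callable, not {method!r}")

stem = get_stem_function("snowball", "danish")
print(stem("husene"))  # e.g. 'hus'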
Example #22
class GigawordParser(StreamParser):
    STEMMERS = {
        "eng": PorterStemmer(ignore_stopwords=False),
        "spa": SpanishStemmer(),
    }

    def __init__(self, language):
        self.next_id = 0
        self.language = language
        self.stemmer = self.STEMMERS.get(language)
        if self.stemmer is None:
            raise Exception("Unsupported language %s" % language)

    def init_id_counter(self, initial):
        self.next_id = initial

    def new_id(self):
        new_id = self.next_id
        self.next_id += 1
        return new_id

    def parse_raw(self, xml_str):
        xml = minidom.parseString(xml_str)
        if self.language == "es":
            try:
                url = "gigaword:" + xml.getElementsByTagName(
                    "DOC")[0].attributes["id"].value
                title = xml.getElementsByTagName(
                    "HEADLINE")[0].firstChild.nodeValue
            except:
                url = "<NONE>"
                title = "<NONE>"
        else:
            url = "<NONE>"
            title = "<NONE>"
        text = stringio.StringIO()
        for node in xml.getElementsByTagName("TEXT")[0].childNodes:
            if len(node.childNodes) > 0:
                text.write(node.firstChild.nodeValue)
        content = text.getvalue()
        terms = text_to_terms(content, self.language)
        return RuwacDocument(self.new_id(), url, title, content, terms)
Example #23
def stemVector(vector, method="lemmatize"):
    output = []
    if method == 'lemmatize':
        wnl = WordNetLemmatizer()
        for i in vector:
            i = wnl.lemmatize(i)
            output.append(i)
    if method == 'snowball':
        st = EnglishStemmer()
        for i in vector:
            i = st.stem(i)
            output.append(i)
    if method == 'porter':
        st = PorterStemmer()
        for i in vector:
            i = st.stem(i)
            output.append(i)
    if method == 'lancaster':
        st = LancasterStemmer()
        for i in vector:
            i = st.stem(i)
            output.append(i)
    return output
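A quick comparison call for stemVector, assuming the NLTK WordNet data is available for the lemmatizer branch; the word list is arbitrary:

words = ["studies", "studying", "better", "running"]
for method in ("lemmatize", "snowball", "porter", "lancaster"):
    print(method, stemVector(words, method=method))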
Example #24
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from nltk.stem.porter import PorterStemmer
from gensim.utils import simple_preprocess
import pickle
import itertools

###############################################################################
### Fetch the dataset
###############################################################################
_20news = fetch_20newsgroups(subset="all")
print("Dataset 20NEWS loaded...")
data = _20news.data
target = _20news.target
###############################################################################
# Pre-process the dataset
###############################################################################
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = [
    'edu', 'com', 'gov', 'ca', 'mit', 'uk', 'subject', 'lines', 'organization',
    'writes', 'msg', 'article', 'university', 'does', 'posting', 'thanks',
    'don', 'know', 'help', 'use', 'copy'
]
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words
                  ])  # Stem the stop words for larger detection
processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
Example #25
"""
TFIDF based baseline normalization.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from ontology import read_ontology
import numpy
from write import write
from read import read
import settings
import sys

from nltk.stem.snowball import PorterStemmer
stemmer = PorterStemmer()
from nltk import word_tokenize

from baseline import build_tfidf
            
def normalize(in_path, out_path):
    print('Reading concept data')
    concepts = list(read_ontology())
    
    concept_ids, concept_names, concept_map, concept_vectors, tfidf_vectorizer = build_tfidf(concepts)
    reverse_concept_map = {concept_ids[i]:concept_names[i] for i in range(len(concept_names))}
    print('Making predictions')
    
    devel_data = read(in_path)
    
    devel_tuples = []
    for entity_id, data in devel_data.items():
Example #26
 def __init__(self):
     self.api_key = KEY
     self.tokenizer = nltk.WordPunctTokenizer()
     self.stm = PorterStemmer()
Example #27
class ApiClient(object):

    API_URL = "http://api.rottentomatoes.com/api/public/v1.0/movies.json"
    MOVIE_URL = "http://api.rottentomatoes.com/api/public/v1.0/movies/{}.json"

    def __init__(self):
        self.api_key = KEY
        self.tokenizer = nltk.WordPunctTokenizer()
        self.stm = PorterStemmer()

    def _load(self, **kwargs):
        """
        Loads list of movies via filter
        """
        params = dict(kwargs)
        params["apikey"] = self.api_key
        response = requests.get(self.API_URL, params=params).json()
        if response and "Error" in response:
            raise ValueError(response.get("Error", "Unknown error"))
        else:
            return response

    def _load_movie(self, movie_id, **kwargs):
        """
        Loads extra movie information such as directors, genres, etc.
        """
        params = dict(kwargs)
        params["apikey"] = self.api_key
        response = requests.get(self.MOVIE_URL.format(str(movie_id)),
                                params=params).json()
        if response and "Error" in response:
            raise ValueError(response.get("Error", "Unknown error"))
        else:
            return response

    def normalize(self, text):
        tokens = list()
        for token in self.tokenizer.tokenize(text.lower()):

            # Excludes stopwords, punctuation; stemming
            if token in stopwords.words('english'):
                continue
            token = self.stm.stem(token)
            if token.isalpha():
                tokens.append(token)

        return tokens

    def get_extra_params(self, movie_id, movie):
        """
        Saves extra features of movie
        """
        m = self._load_movie(movie_id)
        if ('genres' in m and 'runtime' in m
                and 'critics_consensus' in m
                and 'abridged_cast' in m
                and 'abridged_directors' in m and 'studio' in m):
            movie.genres = m.get("genres")
            movie.runtime = m.get("runtime")
            movie.critics_consensus = self.normalize(
                m.get("critics_consensus"))
            movie.abridged_cast_names = [
                ac['name'] for ac in m.get("abridged_cast")
            ]
            try:
                movie.first_director = m.get("abridged_directors")[0]['name']
            # This never happened: check type of exception
            except ValueError:
                return False
            movie.studio = m.get("studio")
            return True
        return False

    def search_movies(self, keyword, movie_ids, page_limit=50):
        #DBG
        logging.debug("Searching movies by keyword '%s'", keyword)

        # Get list of movies
        response = self._load(q=keyword, page_limit=1, page=1)
        n = response.get("total")

        # Load all 25 pages x 50 movies
        for i in range(min(n // page_limit, 25)):
            response = self._load(q=keyword, page_limit=page_limit, page=i + 1)
            if response:
                movies = response.get("movies")
                if movies:
                    for result in movies:
                        movie_id = result.get("id")
                        print(movie_id)

                        if not movie_id or movie_id in movie_ids:
                            continue
                        movie_ids.add(movie_id)

                        title = result.get("title")
                        synopsis = result.get("synopsis")
                        # Convert rating into linear scale [0-4]
                        rating = self.set_rating(result.get("mpaa_rating"))

                        if title and rating >= 0:
                            movie = Movie(movie_id, title)
                            if not synopsis:
                                movie.synopsis = ['EMPTY']
                            else:
                                movie.synopsis = self.normalize(synopsis)
                            movie.mpaa_rating = rating

                            # Load extra movie information
                            if self.get_extra_params(movie_id, movie):
                                yield movie

    @staticmethod
    def set_rating(rating):
        if rating == 'G':
            return 0
        elif rating == 'PG':
            return 1
        elif rating == 'PG-13':
            return 2
        elif rating == 'R':
            return 3
        elif rating == 'NC-17':
            return 4
        else:
            return -1
Example #28
from nltk.stem.snowball import PorterStemmer
import food_detection_root
import os
import codecs

stemmer = PorterStemmer()
path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
what_food_list_file = codecs.open(path + "list - what_food.txt",
                                  encoding='utf-8')
what_food_list = what_food_list_file.read().splitlines()
stemmed_list = list()
what_food_list_file.close()
for word in what_food_list:
    stemmed_word = stemmer.stem(word)
    stemmed_list.append(stemmed_word)
what_food_stemmed_list_file = codecs.open(path +
                                          "list - stemmed_what_food.txt",
                                          encoding='utf-8',
                                          mode='a')
for word in stemmed_list:
    what_food_stemmed_list_file.write(word + "\n")
what_food_stemmed_list_file.close()
def removepunct_tokenize_stem(text):
    text = "".join([ch for ch in text if ch not in string.punctuation]) #Remove punctuation
    tokens = word_tokenize(text)
    stemmer = PorterStemmer()
    final = stem_tokens(tokens, stemmer)
    return final
Example #32
import csv
import numpy as np
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from gensim.utils import simple_preprocess

data = []
target = []
with open('data/dbpedia/test.csv', 'r', encoding='utf-8') as f:
    csv_file = csv.reader(f, delimiter=',')
    for row in csv_file:
        target.append(int(row[0])) # Class index
        data.append(row[2].encode('utf-8', 'ignore')) # Text description (ignore the entity name)
data = np.asarray(data)
target = np.asarray(target)
target = target - 1 # Labels starting from 0
print("Dataset DBPEDIA loaded...")
###############################################################################
### Pre-process the dataset
###############################################################################

print("Pre-processing the dataset...")
stemmer = PorterStemmer() # Define the type of stemmer to use
additional_stop_words = []
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words]) # Stem the stop words for larger detection
processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    #[stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []: # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
Example #33
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, PorterStemmer())
    return stems
    # Unreachable: the block below appears to be the tail of a separate helper
    # (its def line is missing from the snippet) that strips from `lines` every
    # word whose count in `dic` is below 2.
    for key, value in dic.items():
        if value < 2:
            to_rm.append(key)
    pattern = re.compile(r'\b(' + r'|'.join(to_rm) + r')\b\s*')
    for i in range(len(lines)):
        if not (len(to_rm) == 0):
            lines[i] = pattern.sub('', lines[i])
    return lines

if __name__ == "__main__":
    not_stopw = ["no", "nor", "not", "over", "under", "again", "further",
            "but", "against", "too", "very"]
    stopw = stopwords.words('english')
    for x in not_stopw:
        stopw.remove(x)
    stemmer = Stemmer()
    pattern = re.compile(r'\b(' + r'|'.join(stopw) + r')\b\s*')
    script, fin, fout = sys.argv
    with open(fin, 'r') as f_in:
        lines = f_in.readlines()
        grades = []
        for i in range(len(lines)):
            line = lines[i].split("\t")
            grades.append(line[0])
            lines[i] = line[1].replace("\n", "")
            lines[i] = cls(lines[i])
        for i in range(len(lines)):
            lines[i] = lines[i].replace("n't", " not")
        for i in range(len(lines)):
            lines[i] = lines[i].lower()
            lines[i] = pattern.sub('', lines[i])