Example #1
def featurize(review):
	featurized_review = defaultdict(int)
	# bag_of_words(featurized_review, review)
	# bigrams(featurized_review, review)
	# stars(featurized_review, review)
	stem(featurized_review, review)
	# pos_tag(featurized_review, review)
	# preprocessed_bag_of_words(featurized_review, review)
	# service_vs_food(featurized_review, review)
	return featurized_review
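Here stem(featurized_review, review) is expected to fill the defaultdict(int) in place. A minimal sketch of such an extractor, assuming (not shown in the original) that the review exposes its raw text under a hypothetical review['text'] key and that NLTK with its 'punkt' data is available:

# Hypothetical sketch of a stem-based feature extractor compatible with featurize().
# Assumption: review['text'] holds the raw review text.
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

_stemmer = PorterStemmer()

def stem(featurized_review, review):
    # count each stemmed, lowercased token; defaultdict(int) starts every key at 0
    for token in word_tokenize(review['text']):
        featurized_review[_stemmer.stem(token.lower())] += 1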
Example #2
def format(line):  # ---------------------------------------------------- FORMAT
    stopwords = nltk.corpus.stopwords.words('english')      #list of stopwords
    useless = ["would", "could", "in", "use"]
    real = list()
    listOfWords = []
    text = line["content"]

    text = text.str.strip('"')                                  #eliminate quotes
    text = text.str.split()                                     #split into words

    for word in text:
        for realWord in word:
            listOfWords.append(realWord.lower())

    #drop stop words and other uninformative words
    real += [word for word in listOfWords if
             word not in stopwords and
             word not in useless]

    text = sorted(real)
    if VERSION < 4:
        line[1] = lem(line[1])
        line[1] = stem(line[1])
    text = lem(text)
    return text
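Setting the pandas-specific .str calls aside, the heart of format() is the stop-word filter. A standalone sketch of that step on a plain string, assuming only NLTK:

# Sketch of the stop-word filtering step above, applied to a plain string
# instead of the pandas column accessed through .str in format().
import nltk

def filter_stopwords(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    useless = {"would", "could", "in", "use"}
    words = [w.strip('"').lower() for w in text.split()]
    return sorted(w for w in words if w not in stopwords and w not in useless)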
Example #3
def get_vocabulary():
    spam_dir = os.path.join(os.getcwd(), "train", "spam")
    training_files = [os.path.join(spam_dir, f) for f in os.listdir(spam_dir)]

    ham_dir = os.path.join(os.getcwd(), "train", "ham")
    training_ham = [os.path.join(ham_dir, f) for f in os.listdir(ham_dir)]

    training_files.extend(training_ham)

    v_word = list()

    for f_name in training_files:
        if f_name.split('.')[-1] == "txt":
            with open(f_name, "r") as f:
                lines = stem(f.read())
            words = set(lines)
            for w in words:
                if w not in v_word:
                    v_word.append(w)

    return v_word
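Because v_word is a list, the membership test makes get_vocabulary() quadratic in the vocabulary size. A sketch of the same idea with a set (insertion order is not preserved, so the result is sorted):

def get_vocabulary_set(training_files):
    # training_files is the same list of paths built above
    vocabulary = set()
    for f_name in training_files:
        if f_name.endswith(".txt"):
            with open(f_name, "r") as f:
                vocabulary.update(stem(f.read()))
    return sorted(vocabulary)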
Example #4
def train():
    data = {}
    total_spam = 0
    total_ham = 0
    nof_spam = 0
    nof_ham = 0
    for filename in os.listdir("./ham_train"):
        with open(os.path.join('./ham_train', filename), errors='ignore') as file:
            wordcount = Counter(file.read().split())
        for item in wordcount.items():
            if item[0] in data:
                data[item[0]][1] += item[1]
            else:
                data[item[0]] = [0, item[1]]
            total_ham += item[1]
        nof_ham += 1
    for filename in os.listdir('./spam_train'):
        with open(os.path.join('./spam_train', filename), errors='ignore') as file:
            wordcount = Counter(file.read().split())
        for item in wordcount.items():
            if item[0] in data:
                data[item[0]][0] += item[1]
            else:
                data[item[0]] = [item[1], 0]
            total_spam += item[1]
        nof_spam += 1
    data = stem(data)
    return data
def test():
    data = {}
    total_spam = 0
    total_ham = 0
    nof_spam = 0
    nof_ham = 0
    for filename in os.listdir('./ham_test'):
        with open(os.path.join('./ham_test', filename), errors='ignore') as file:
            wordcount = Counter(file.read().split())
        for item in wordcount.items():
            if item[0] in data:
                data[item[0]][1] += item[1]
            else:
                data[item[0]] = [0, item[1]]
            total_ham += item[1]
        nof_ham += 1
    for filename in os.listdir('./spam_test'):
        with open(os.path.join('./spam_test', filename), errors='ignore') as file:
            wordcount = Counter(file.read().split())
        for item in wordcount.items():
            if item[0] in data:
                data[item[0]][0] += item[1]
            else:
                data[item[0]] = [item[1], 0]
            total_spam += item[1]
        nof_spam += 1
    #print(nof_spam)
    data = stem(data)
    data_new, ps, ph = cond.Prob(data, total_spam, total_ham, nof_spam,
                                 nof_ham)
    return data_new, ps, ph
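train() and test() accumulate per-word [spam_count, ham_count] pairs, and test() hands them to cond.Prob, which is not shown here. Purely as an assumption about what such a step might compute, a Laplace-smoothed sketch over the same inputs could look like this:

# Hedged sketch only: cond.Prob is not shown in this example, so this is an
# assumed Laplace-smoothed computation over the data dict built above.
import math

def conditional_log_probs(data, total_spam, total_ham, nof_spam, nof_ham):
    # data maps word -> [spam_count, ham_count]
    vocab_size = len(data)
    probs = {}
    for word, (spam_count, ham_count) in data.items():
        probs[word] = (
            math.log((spam_count + 1) / (total_spam + vocab_size)),
            math.log((ham_count + 1) / (total_ham + vocab_size)),
        )
    # class priors from the number of documents seen
    p_spam = math.log(nof_spam / (nof_spam + nof_ham))
    p_ham = math.log(nof_ham / (nof_spam + nof_ham))
    return probs, p_spam, p_ham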
Example #6
def clean(link):
    """
    Apply every preprocessing operation to the article fetched from a link.
    """
    text = get_Article(link)
    tokens = tokenize(text)
    stemmed_token = stem(tokens)
    big_words = particle_removal(stemmed_token)

    print(big_words)
Example #7
def stem(tokens, stemmer):
  if stemmer == 'porter':
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(i) for i in tokens]
  elif stemmer == 'porter2':
    # NOTE: defining this function as `stem` shadows any imported porter2
    # stem(); as written, this call recurses into this function itself.
    tokens = [stem(i) for i in tokens]
  elif stemmer == 'lemma':
    lemmatiser = WordNetLemmatizer()
    tokens = [lemmatiser.lemmatize(i) for i in tokens]

  return tokens
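A usage sketch for this two-argument stem() helper, exercising the 'porter' and 'lemma' branches (the 'porter2' branch is left out because, as noted in the comment above, the function's own name shadows whatever porter2 stem() it expects). It assumes PorterStemmer and WordNetLemmatizer are imported in the surrounding module, as the function body implies, plus NLTK's 'punkt' data for word_tokenize:

from nltk.tokenize import word_tokenize

tokens = word_tokenize("the cats were running quickly")
print(stem(tokens, 'porter'))  # e.g. ['the', 'cat', 'were', 'run', 'quickli']
print(stem(tokens, 'lemma'))   # e.g. ['the', 'cat', 'were', 'running', 'quickly']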
Example #8
def evaluate(m, f_name, train_flag, expected):

    # stem() is expected to return a list of tokens here
    with open(f_name, "r") as f:
        text = stem(f.read())

    tc = list()

    for p in string.punctuation:
        while p in text:
            text[text.index(p)] = ' '

    text = "".join("".join(text).split())

    summed = 0.0

    for i in range(len(m[0])):
        temp = text.count(m[0][i])

        summed = summed + (m[1][i] * temp)
        #if temp > 0.0:
        #print(temp)
        if train_flag:
            tc.append(temp)

    summed = summed + m[1][-1]

    #print("Sum: " + str(summed))

    #print(model[0][0])

    #print(str(spam_score) + str(ham_score))

    if train_flag:
        tc.append(1)

    result = 1 if summed >= 0.0 else -1

    if train_flag:
        for i in range(len(m[1])):
            #temp = m[1][i]
            m[1][i] = m[1][i] + (learning_rate * tc[i] * (expected - result))
            #if temp - m[1][i] < -0.0001:
            #    print("UP (" + str(temp) + ", " + str(m[1][i]) + ")")
            #elif temp - m[1][i] > 0.0001:
            #    print("DOWN (" + str(temp) + ", " + str(m[1][i]) + ")")
    #else:
    #print(summed)
    return result
Example #9
def pre_process(text):
    """
    Preprocesses a given document by replacing apostrophes, tokenizing (which removes punctuation),
    lowercasing, removing stop words, and stemming with the PorterStemmer.

    :param text: String The text which should be pre-processed.
    :return:     String The pre-processed text as a whitespace-delimited String containing the
                        processed words in original order.
    """
    text = stem(remove_stop_words(to_lower(tokenize(
        replace_apostrophe(text)))))
    return ' '.join(text)
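The helpers composed here (replace_apostrophe, tokenize, to_lower, remove_stop_words, stem) come from the surrounding module and are not shown. A usage sketch, with the output hedged because it depends entirely on those helpers:

processed = pre_process("The striped bats aren't hanging on their feet.")
print(processed)  # roughly "stripe bat hang feet", depending on the helpers' exact behaviour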
Example #10
def bag_of_words(phrase):
    sents = nltk.tokenize.sent_tokenize(phrase)
    words = []
    for sent in sents:
        words += nltk.tokenize.word_tokenize(sent)
    swords = unstop(stem(words))

    bag = {}
    for word in swords:
        if word in bag:
            bag[word] += 1
        else:
            bag[word] = 1
    return bag
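The manual counting loop in bag_of_words() is equivalent to collections.Counter; a sketch using the same assumed unstop() and stem() helpers:

from collections import Counter
import nltk

def bag_of_words_counter(phrase):
    # same tokenization as bag_of_words(), but counting via Counter
    words = []
    for sent in nltk.tokenize.sent_tokenize(phrase):
        words += nltk.tokenize.word_tokenize(sent)
    return dict(Counter(unstop(stem(words))))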
Example #11
def useModel(modelPath, test):
    stemmed_no_stopwords_test = stem(removeStopwords(test))

    with open(modelPath, 'rb') as fid:
        model = pickle.load(fid)

    predicted = model.predict(stemmed_no_stopwords_test.data)

    print("My Best Configuration")

    print(
        metrics.classification_report(
            stemmed_no_stopwords_test.target,
            predicted,
            target_names=stemmed_no_stopwords_test.target_names))
    print("Macro Average F1: " + str(
        metrics.f1_score(
            stemmed_no_stopwords_test.target, predicted, average='macro')) +
          "\n")
Example #12
def readFiles(fileName):
    docs = {}
    f = open(fileName)
    count = 0
    for line in f.readlines():
        count += 1
        if count % 2 == 0:
            doc = nltk.word_tokenize(line.strip())
            if isLowerCase:
                doc = lowerCase(doc)
            if isStem:
                doc = stem(doc)
            if isRemoveStopWords:
                doc = removeStopWords(doc)
            if isRemovePunctuation:
                doc = removePunctuation(doc)
            if not isUnigram:
                doc = bigram(doc)
            docs[count // 2] = doc
    f.close()
    return docs
Example #13
def trainModel(training, name):
    stemmed_no_stopwords = stem(removeStopwords(training))

    # Removed Stopwords + Stemmed Training Data
    print("Removed Stopwords + Porter Stemmed")

    classifier = Pipeline([('vect', CountVectorizer()),
                           ('tfidf', TfidfTransformer()),
                           ('clf',
                            SGDClassifier(loss='hinge',
                                          penalty='l2',
                                          alpha=1e-3,
                                          n_iter=5,
                                          random_state=42))])

    classifier = classifier.fit(stemmed_no_stopwords.data,
                                stemmed_no_stopwords.target)

    # Save the classifier
    with open(name, 'wb') as fid:
        pickle.dump(classifier, fid)
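trainModel() pairs naturally with useModel() from Example #11. A usage sketch, assuming training and test are bunch-like objects with .data, .target and .target_names, as the pipeline calls imply, and a hypothetical pickle filename:

trainModel(training, "best_model.pkl")   # fit and pickle the classifier
useModel("best_model.pkl", test)         # reload it and report test metrics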
Example #14
def preprocess(data):
    stemmer = PorterStemmer()
    wrds = stem(stemmer, data['text'].lower().split(" "))
    pol = data['pol']
    return (wrds, pol)
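Note that here stem is called as stem(stemmer, tokens), the reverse of the stem(tokens, stemmer) signature in Example #7. A minimal sketch of the helper this example appears to assume:

def stem(stemmer, words):
    # apply the given NLTK stemmer to every token
    return [stemmer.stem(w) for w in words]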
Example #15
    def train_clean(self, X, y):
        cleaned_x = []
        cleaned_y = []
        num_utter = len(X)
        token_mismatch_before = []
        token_mismatch_after = []
        blank_utterances = []

        for id in xrange(num_utter):
            utter_x = []
            utter_y = []
            utter_labels_tokens = y[id].split()
            utter_tokens = X[id].split()
            try:
                assert (len(utter_tokens) == len(utter_labels_tokens))
            except:
                token_mismatch_before.append(id + 1)
                continue
            for i in xrange(len(utter_labels_tokens)):
                if not ((utter_labels_tokens[i] in self.kick_labels) or
                        (utter_tokens[i] in self.kick_tokens)
                        or utter_tokens[i].isdigit()):
                    try:
                        utter_token = utter_tokens[i]
                        if self.stemm:
                            utter_token = stem(utter_token)
                        if self.lem:
                            wordnet_lemmatizer = WordNetLemmatizer()
                            utter_token = wordnet_lemmatizer.lemmatize(
                                utter_token)
                        try:
                            utter_token_label = self.label_dict[
                                utter_labels_tokens[i]]
                        except:
                            utter_token_label = self.failsafe_label
                        unicode_string = utter_token.decode("utf-8")
                        token_corrected = self.remove_accents(unicode_string)
                        utter_x.append(token_corrected)
                        utter_y.append(utter_token_label)
                    except:
                        continue
            sentence_x = (' '.join(utter_x))
            sentence_x = re.sub(r'[@#;,"().*!?:\/\\-]', '', sentence_x)
            sentence_x = re.sub(r'[_\']', '', sentence_x)
            tokens_x = sentence_x.split()
            tokens_y = utter_y
            if len(utter_x) == 0:
                blank_utterances.append(id + 1)
                continue
            if len(tokens_x) != len(tokens_y):
                token_mismatch_after.append(id + 1)
                continue
            cleaned_x.append(tokens_x)
            cleaned_y.append(tokens_y)
        print "Unicode errors...corrected\n" \
              "Token Mismatch Errors :...Skipped \n" \
              "%d before;%d after" % (len(token_mismatch_before),len(token_mismatch_after))
        self.log_mismatches(token_mismatch_before, token_mismatch_after,
                            blank_utterances)
        if self.correct_spellings:  # may have become buggy
            cleaned_x = [str(' '.join(x)) for x in cleaned_x]
            cleaned = [cleaned_x, cleaned_y]
            ets = engTextSeparate(cleaned)
            ets_cacs = ets.cacs()
            cleaned_x = ets_cacs
        return cleaned_x, cleaned_y  # list of list of tokens/labels
Example #16
def ler_arquivo_clg():



    logging.info("Program started!")
    #initializing the word vs. documents array

    words_documents = []

    #reading the config file with the input (LEIA) and output (ESCREVE) entries
    config = configparser.RawConfigParser(strict=False,dict_type=MultiOrderedDict)
    logging.info("Reading GLI.CFG")
    config.read(['GLI.CFG'])
    entradas = config.get("DEFAULT", "LEIA")
    saida = config.get("DEFAULT", "ESCREVE")
    stemmer_config = config.get("DEFAULT", "STEMMER")

    if stemmer_config[0] == 'true':
        stemmer = 1
    else:
        stemmer = 0

    # nltk.stem is a package, not a callable; use a concrete stemmer instance
    porter = nltk.stem.PorterStemmer()

    logging.info("GLI.CFG has been read")


    logging.info("Reading cfc-2.dtd")

    # read the xml using the dtd
    f = codecs.open('db/cfc-2.dtd')
    dtd = ET.DTD(f)

    logging.info("cfc-2.dtd read")



    logging.info("Starting reading xml")

    begin_time = time.perf_counter()
    for entrada in entradas:
        #print("printing entry " + entrada)
        logging.info("Reading " + entrada + " xml file")
        root = ET.parse(entrada)
        if(dtd.validate(root)):
            xmldoc = minidom.parse(entrada)
            itemlist = xmldoc.getElementsByTagName('RECORD')
            for s in itemlist:
                recordnum = s.getElementsByTagName('RECORDNUM')
                recordnum =  int(recordnum[0].firstChild.nodeValue)
                abstract = s.getElementsByTagName('ABSTRACT')
                if(len(abstract) > 0):
                    text_to_parse = abstract[0].firstChild.nodeValue
                else:
                    extract = s.getElementsByTagName('EXTRACT')
                    if(len(extract) > 0):
                        text_to_parse = extract[0].firstChild.nodeValue
                    else:
                        continue

                text_to_parse = text_to_parse.upper()
                text_to_parse = re.sub(r"[^A-Z ']+", " ", text_to_parse)
                text_words = text_to_parse.split()

                for word in text_words:
                    word_found = False
                    for wd in words_documents:
                        if (stemmer == 0):
                            if (wd.word == word):
                                wd.documents.append(recordnum)
                                word_found = True
                                break
                        else:
                            if (wd.word == porter.stem(word)):
                                wd.documents.append(recordnum)
                                word_found = True
                                break
                    if (word_found == False):
                        if (stemmer == 0):
                            w = word_document(word)
                            w.documents.append(recordnum)
                            words_documents.append(w)
                        else:
                            w = word_document(porter.stem(word))
                            w.documents.append(recordnum)
                            words_documents.append(w)

                #print(s.attributes['RECORDNUM'].value)
        else:
            logging.info(entrada + " xml file didn't pass on dtd validation")

            #print(dtd.error_log.filter_from_errors())

    end_time = time.perf_counter()
    total_time = end_time - begin_time

    logging.info("Inverted list created a list with " + str(len(words_documents)) + " words")
    logging.info("Inverted list made " + str(len(words_documents) / total_time) + " words per second")
    logging.info("Inverted list made " + str(len(entradas) / total_time) + " documents per second")

    logging.info("Writing on csv")

    with open(saida[0], 'w',newline='') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for wd in words_documents:
            spamwriter.writerow([wd.word,wd.documents])
    logging.info("Finished!")
Example #17
            # Use BeautifulSoup to parse
            soup = BeautifulSoup(inf, 'html.parser')

            # Get the Body
            body = get_body(soup)
            # print 'Body: ', body
            # Get the Title
            title = get_title(soup)
            # print 'Title: ', title
            # Get the Headers
            headers = get_headers(soup)
            # Get the Bold
            bolds = get_bold(soup)

            # Tokenize via NLTK
            title_tokens = stem(tokenize(title))
            if not title_tokens:
                title_tokens = list()
            body_tokens = stem(tokenize(body))
            if not body_tokens:
                body_tokens = list()
            header_tokens = stem(tokenize(headers))
            if not header_tokens:
                header_tokens = list()
            bold_tokens = stem(tokenize(bolds))
            if not bold_tokens:
                bold_tokens = list()

            all_tokens = clean_up(remove_unwanted(title_tokens + body_tokens))
            strong_tokens = clean_up(
                remove_unwanted(title_tokens + header_tokens + bold_tokens))
Example #18
def getDiseaseFromSymptom(message, number):

    user_input = message
    letters_only = re.sub("[^a-zA-Z]", " ", user_input)
    lower_case = letters_only.lower()
    words = lower_case.split()
    words = [w for w in words if not w in stopwords.words("english")]
    stemmed_words = [stem(word) for word in words]

    val = getDiseaseFromLocalValues(stemmed_words, number)
    if val != "":
        return val

    symptoms_having_ids = [
        'dizzi', 'weight', 'tired', 'feel', 'heartburn', 'back', 'menstruat',
        'paralysi', 'skin', 'stomach', 'cold', 'miss', 'sleepless', 'eye',
        'droop', 'earach', 'memori', 'nervous', 'hot', 'chest', 'lip',
        'nausea', 'earli', 'headach', 'fever', 'reduc', 'itch', 'swollen',
        'burn', 'weak', 'stuffi', 'sneez', 'sore', 'hiccup', 'vomit', 'wheez',
        'fast,', 'increas', 'tremor', 'cough', 'runni', 'chill', 'palpit',
        'short', 'neck', 'sputum', 'tear', 'abdomin', 'cheek', 'dri',
        'anxieti', 'sweat', 'night', 'unconsciousness,'
    ]

    symptom_to_id = {
        'dizzi': 207,
        'weight': 23,
        'tired': 16,
        'feel': 76,
        'heartburn': 45,
        'back': 104,
        'menstruat': 112,
        'paralysi': 140,
        'skin': 124,
        'stomach': 179,
        'sleep': 52,
        'eye': 33,
        'droop': 244,
        'earach': 87,
        'memori': 235,
        'nervous': 114,
        'chest': 17,
        'lip': 35,
        'nausea': 44,
        'earli': 92,
        'headach': 9,
        'fever': 11,
        'appetit': 54,
        'itch': 96,
        'swollen': 169,
        'burn': 46,
        'weak': 56,
        'stuffi': 28,
        'sneez': 95,
        'sore': 13,
        'hiccup': 122,
        'vomit': 181,
        'wheez': 30,
        'thirst': 40,
        'tremor': 115,
        'cough': 15,
        'runni': 14,
        'chill': 175,
        'palpit': 37,
        'neck': 136,
        'sputum': 64,
        'tear': 211,
        'abdomin': 10,
        'cheek': 170,
        'dri': 273,
        'anxieti': 238,
        'sweat': 138,
        'night': 133,
        'unconsciousness,': 144
    }

    the_real_symptoms_with_ids = list(
        set(symptoms_having_ids).intersection(stemmed_words))
    print(the_real_symptoms_with_ids)
    ids = []
    for i in the_real_symptoms_with_ids:
        ids.append(str(symptom_to_id[i]))
    return getPotentialDiseasesFromIds(ids, number)
Example #19
def tokenize_and_stem(text, stemmer="lemma", is_english_word=None):
  if is_english_word is None:
    is_english_word = load_from_dictionary("english")

  return stem(tokenize(text, is_english_word), stemmer)
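A usage sketch, assuming tokenize() and load_from_dictionary() come from the same module and that stem() is the two-argument helper shown in Example #7:

print(tokenize_and_stem("The cats were running quickly", stemmer="porter"))
# e.g. ['the', 'cat', 'were', 'run', 'quickli'], depending on tokenize()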
Example #20
def explorationResults(training, test):
    # Process training data
    no_stopwords = removeStopwords(training)
    stemmed = stem(training)
    stemmed_no_stopwords = stem(removeStopwords(training))

    # Process test data
    no_stopwords_test = removeStopwords(test)
    stemmed_test = stem(test)
    stemmed_no_stopwords_test = stem(removeStopwords(test))

    # Unigram Baseline
    print("Unigram Baseline")
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Removed Stopwords Training Data
    print("Removed Stopwords")
    classify(
        no_stopwords, no_stopwords_test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Stemmed Training Data
    print("Porter Stemmed")
    classify(
        stemmed, stemmed_test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Removed Stopwords + Stemmed Training Data
    print("Removed Stopwords + Porter Stemmed")
    classify(
        stemmed_no_stopwords, stemmed_no_stopwords_test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Univariate Feature Selection
    print("Univariate Feature Selection")
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('selector', SelectPercentile(chi2, 25)),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # L2 Regularization
    print("L2 Regularization")
    t = copy.copy(training)
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('selector', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # L2 Regularization + Univariate Feature Selection
    print("L2 Regularization + Univariate Feature Selection")
    t = copy.copy(training)
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('L2', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('univariate', SelectPercentile(chi2, 25)),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Univariate Feature Selection + L2 Regularization
    print("Univariate Feature Selection + L2 Regularization")
    t = copy.copy(training)
    classify(
        training, test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('univariate', SelectPercentile(chi2, 25)),
                  ('L2', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Removed Stopwords + Univariate Feature Selection
    print("Removed Stopwords + Univariate Feature Selection")
    classify(
        no_stopwords, no_stopwords_test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('selector', SelectPercentile(chi2, 25)),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Removed Stopwords + L2 Regularization
    print("Removed Stopwords + L2 Regularization")
    classify(
        no_stopwords, no_stopwords_test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('selector', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Stemmed Training Data + Univariate Feature Selection
    print("Porter Stemmed + Univariate Feature Selection")
    classify(
        stemmed, stemmed_test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('selector', SelectPercentile(chi2, 25)),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Stemmed Training Data + L2 Regularization
    print("Porter Stemmed + L2 Regularization")
    classify(
        stemmed, stemmed_test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('selector', SelectFromModel(LinearSVC(penalty="l2"))),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])

    # Stemmed Training Data + Removed Stopwords + Univariate Feature Selection
    print("Porter Stemmed + Removed Stopwords + Univariate Feature Selection")
    classify(
        stemmed_no_stopwords, stemmed_no_stopwords_test,
        Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                  ('selector', SelectPercentile(chi2, 25)),
                  ('clf',
                   SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 alpha=1e-3,
                                 n_iter=5,
                                 random_state=42))]), [])
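Every configuration above repeats the same CountVectorizer/TfidfTransformer/SGDClassifier scaffolding. A small helper (a sketch reusing the original hyperparameters) keeps the variants readable:

def sgd_pipeline(*selectors):
    # build the shared pipeline, inserting any feature-selection steps in between
    steps = [('vect', CountVectorizer()), ('tfidf', TfidfTransformer())]
    steps.extend(selectors)
    steps.append(('clf', SGDClassifier(loss='hinge',
                                       penalty='l2',
                                       alpha=1e-3,
                                       n_iter=5,
                                       random_state=42)))
    return Pipeline(steps)

# e.g. classify(stemmed, stemmed_test,
#               sgd_pipeline(('selector', SelectPercentile(chi2, 25))), [])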