def f_ngram_pos(train, test, mode='tfidf', binary=1, ngram=(1, 1), min_c=1):
    '''Extract n-gram features based on part-of-speech (POS) information.
    return: feature arrays (fgram_train, fgram_test); feature vocabulary (vocab)
    input: raw train and test data; ngram = (n, n) denotes n-grams;
           tokens with document frequency below min_c are cut off.
    '''
    if mode == 'tfidf':
        if binary == 1:
            gram = text.TfidfVectorizer(ngram_range=ngram,
                                        binary=True,
                                        min_df=min_c)
        else:
            gram = text.TfidfVectorizer(ngram_range=ngram, min_df=min_c)
    else:  # mode=count
        if binary == 1:
            gram = text.CountVectorizer(ngram_range=ngram,
                                        binary=True,
                                        min_df=min_c)
        else:
            gram = text.CountVectorizer(ngram_range=ngram, min_df=min_c)
    train = dataFilter_train(train)
    train_pos = data_Pos(train)
    gram.fit(train_pos)
    vocab = gram.get_feature_names()
    fgram_train = gram.transform(train_pos).toarray()
    test = dataFilter_train(test)
    test_pos = data_Pos(test)
    fgram_test = gram.transform(test_pos).toarray()
    return (fgram_train, fgram_test, vocab)
def f_ngram(data, mode='tfidf', binary=1, ngram=(1, 1), min_c=1):
    '''Extract n-gram features.
    return: feature array (fgram); feature vocabulary (vocab)
    input: data; ngram = (n, n) denotes n-grams only, while ngram = (1, 2)
    denotes unigrams plus bigrams; tokens with document frequency below min_c are cut off.
    '''
    if mode == 'tfidf':
        if binary == 1:
            gram = text.TfidfVectorizer(ngram_range=ngram,
                                        binary=True,
                                        min_df=min_c)
        else:
            gram = text.TfidfVectorizer(ngram_range=ngram, min_df=min_c)

    else:  #mode=count
        if binary == 1:
            gram = text.CountVectorizer(ngram_range=ngram,
                                        binary=True,
                                        min_df=min_c)
        else:
            gram = text.CountVectorizer(ngram_range=ngram, min_df=min_c)
    gram = gram.fit(data)
    vocab = gram.get_feature_names()
    fgram = gram.transform(data).toarray()
    return (fgram, vocab)
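A minimal usage sketch for f_ngram (not part of the original example), assuming the function above is in scope, `text` is `sklearn.feature_extraction.text`, and the tiny corpus below is hypothetical:

from sklearn.feature_extraction import text

docs = ["the cat sat on the mat", "the dog sat on the log"]
# Binary tf-idf features over unigrams and bigrams; min_c=1 keeps every token.
fgram, vocab = f_ngram(docs, mode='tfidf', binary=1, ngram=(1, 2), min_c=1)
print(fgram.shape)  # (2, len(vocab))
print(vocab[:5])    # first few n-gram feature names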
Example #3
def extract_features(features_train, features_test, post_train, post_test,
                     cap_train, cap_test, title_train, title_test, low_train,
                     low_test, digit_tr, digit_test, con_train, con_test,
                     back_train, back_test, front_train, front_test):
    vectorizer_1 = text.CountVectorizer(ngram_range=(1, 1))
    vectorizer_2 = text.CountVectorizer(ngram_range=(1, 1))
    vectorizer_3 = text.CountVectorizer(ngram_range=(1, 1))
    vectorizer_4 = text.CountVectorizer(ngram_range=(1, 1))
    vectorizer_5 = text.CountVectorizer(ngram_range=(1, 1))
    training_vector = vectorizer_1.fit_transform(features_train)
    test_vector = vectorizer_1.transform(features_test)
    ptraining_vector = vectorizer_2.fit_transform(post_train)
    ptest_vector = vectorizer_2.transform(post_test)
    context_train = vectorizer_3.fit_transform(con_train)
    context_test = vectorizer_3.transform(con_test)
    bck_train = vectorizer_4.fit_transform(back_train)
    bck_test = vectorizer_4.transform(back_test)
    frnt_train = vectorizer_5.fit_transform(front_train)
    frnt_test = vectorizer_5.transform(front_test)
    training_vec = sp.sparse.hstack(
        (training_vector, context_train, bck_train, frnt_train,
         ptraining_vector, csr_matrix(cap_train).T, csr_matrix(title_train).T,
         csr_matrix(low_train).T, csr_matrix(digit_tr).T))
    test_vect = sp.sparse.hstack(
        (test_vector, context_test, bck_test, frnt_test, ptest_vector,
         csr_matrix(cap_test).T, csr_matrix(title_test).T,
         csr_matrix(low_test).T, csr_matrix(digit_test).T))
    return training_vec, test_vect
Example #4
def bag_of_words(tr_tweets,
                 te_tweets,
                 tr_targets=pd.Series(),
                 te_targets=pd.Series(),
                 per_target=False,
                 max_feats=None,
                 normalise_counts=False,
                 **kwargs):
    """
    Calculate bag-of-words representations of train and test tweets
    :param tr_tweets: pandas Series of strings, raw texts to convert (from train set)
    :param te_tweets: pandas Series of strings, raw texts to convert (from test set)
    :param tr_targets: pandas Series of strings, target classes (from train set)
    :param te_targets: pandas Series of strings, target classes (from test set)
    :param per_target: bool, whether to find separate BoW repr for each target class
    :param max_feats: int, maximum number of words/ngrams to keep, number of dimensions
    in returned feature matrices
    :param normalise_counts: bool, whether to divide the counts within each tweet by the
    number of tokens (not for Multinomial NB)
    :param kwargs: to be passed onto sklearn CountVectorizer
    :return: tuple, training feature matrix, test feature matrix, list of feature names
    (with '_bow' appended to each)
    """

    if per_target and not tr_targets.empty and not te_targets.empty:
        # Create different BoW for each target
        # Only useful when using max_features, as the most common words/n-grams
        # may belong to only one or two of the targets
        x_tr = np.zeros((tr_tweets.shape[0], max_feats), dtype=np.int64)
        x_te = np.zeros((te_tweets.shape[0], max_feats), dtype=np.int64)
        for _targ in tr_targets.unique():
            word_bagger = text_sk.CountVectorizer(max_features=max_feats,
                                                  **kwargs)
            x_tr[(tr_targets == _targ).values] = \
                word_bagger.fit_transform(tr_tweets[(tr_targets == _targ).values].values).toarray()
            x_te[(te_targets == _targ).values] = \
                word_bagger.transform(te_tweets[(te_targets == _targ).values].values).toarray()
    else:
        word_bagger = text_sk.CountVectorizer(max_features=max_feats, **kwargs)
        x_tr = word_bagger.fit_transform(tr_tweets).toarray()
        x_te = word_bagger.transform(te_tweets).toarray()

    if normalise_counts:
        # Normalise counts by tweet length
        tr_tweet_lens = tr_tweets.apply(
            tokenize.TweetTokenizer().tokenize).apply(len)
        te_tweet_lens = te_tweets.apply(
            tokenize.TweetTokenizer().tokenize).apply(len)
        x_tr = np.divide(x_tr, tr_tweet_lens.values[:, np.newaxis])
        x_te = np.divide(x_te, te_tweet_lens.values[:, np.newaxis])
    return x_tr, x_te, [
        _fn + '_bow' for _fn in word_bagger.get_feature_names()
    ]
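A minimal usage sketch for bag_of_words (not part of the original example), assuming the function above is in scope with its `text_sk` and `tokenize` imports; the tweets and max_feats value are hypothetical:

import pandas as pd

tr = pd.Series(["i love this #movie", "worst film ever", "great acting overall"])
te = pd.Series(["love the acting"])
# Plain bag of words: no per-target split, vocabulary capped at 10 terms.
x_tr, x_te, names = bag_of_words(tr, te, max_feats=10, normalise_counts=False)
print(x_tr.shape, x_te.shape)
print(names[:3])  # feature names, each with the '_bow' suffix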
Example #5
File: crf.py  Project: wingjammer1993/IOB
def extract_features(features_train, features_test, post_train, post_test, cap_train, cap_test,
                     title_train, title_test, low_train, low_test, digit_tr, digit_test):
    vectorizer_1 = text.CountVectorizer(ngram_range=(1, 1))
    vectorizer_2 = text.CountVectorizer(ngram_range=(1, 1))
    training_vector = vectorizer_1.fit_transform(features_train)
    test_vector = vectorizer_1.transform(features_test)
    ptraining_vector = vectorizer_2.fit_transform(post_train)
    ptest_vector = vectorizer_2.transform(post_test)
    training_vec = sp.sparse.hstack((training_vector, ptraining_vector, csr_matrix(cap_train).T,
                                     csr_matrix(title_train).T, csr_matrix(low_train).T, csr_matrix(digit_tr).T))
    test_vect = sp.sparse.hstack((test_vector, ptest_vector, csr_matrix(cap_test).T,
                                  csr_matrix(title_test).T, csr_matrix(low_test).T, csr_matrix(digit_test).T))
    return training_vec, test_vect
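A minimal usage sketch for this extract_features variant (not part of the original example), assuming `sp`, `csr_matrix`, and `text` are imported as in the surrounding module; the token-level feature strings and indicator lists below are hypothetical:

import scipy as sp
from scipy.sparse import csr_matrix
import sklearn.feature_extraction.text as text

# One space-joined feature string per token, plus parallel 0/1 indicator lists
# (two training tokens, one test token).
feats_tr, feats_te = ["word=dog suffix=og", "word=ran suffix=an"], ["word=cat suffix=at"]
pos_tr, pos_te = ["NN", "VBD"], ["NN"]
cap_tr, cap_te = [0, 0], [1]
title_tr, title_te = [0, 0], [0]
low_tr, low_te = [1, 1], [0]
dig_tr, dig_te = [0, 0], [0]

X_tr, X_te = extract_features(feats_tr, feats_te, pos_tr, pos_te,
                              cap_tr, cap_te, title_tr, title_te,
                              low_tr, low_te, dig_tr, dig_te)
print(X_tr.shape, X_te.shape)  # sparse matrices with the same number of columns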
def get_test_doc_counts(input_seq_fname, vocab=None):
    lines = []
    n_tokens = []
    with open(input_seq_fname) as f:
        lines = [line for line in f]

    if vocab is None:
        cv = text.CountVectorizer()
    else:
        cv = text.CountVectorizer(vocabulary=vocab)
    doc_matrix = cv.fit_transform(lines)
    n_tokens = np.array(doc_matrix.sum(axis=1))
    return n_tokens
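A minimal usage sketch for get_test_doc_counts (not part of the original example); the file name and contents are hypothetical:

# Write a tiny input file, one document per line.
with open("docs.txt", "w") as f:
    f.write("the cat sat on the mat\n")
    f.write("the dog chased the cat across the yard\n")

# Per-document token counts under the fitted (or supplied) vocabulary.
print(get_test_doc_counts("docs.txt"))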
Example #7
 def __init__(self,
              n_clusters=50,
              pca_n_components=20,
              kmpca_n_components=3,
              kernel_n_components=30):
     self.counter = text.CountVectorizer(stop_words='english',
                                         ngram_range=(1, 2),
                                         min_df=30,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Title_CounterX', 'Title_ClusterdX', 'Title_KmX', 'Title_PCAX',
         'Title_PCAClusterdX', 'Title_RbfX', 'Title_TreeX'
     ]
     self.linear_feature_selector = None
 def __init__(self,
              n_clusters=50,
              pca_n_components=30,
              kmpca_n_components=3,
              kernel_n_components=30):
     ## use (min_df = 30, max_df = 0.5) to generate a lot of features - more choice for feature selection
     ## use (min_df = 0.001, max_df = 0.05) to generate fewer features - better clustering
     self.counter = text.CountVectorizer(stop_words='english',
                                         charset='utf-8',
                                         charset_error='ignore',
                                         ngram_range=(1, 1),
                                         min_df=0.001,
                                         max_df=0.05,
                                         binary=True,
                                         lowercase=True)
     self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                       n_init=10,
                                       batch_size=10000,
                                       verbose=1)
     self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
     self.kmpca = decomposition.RandomizedPCA(
         n_components=kmpca_n_components)
     self.rbf = kernel_approximation.RBFSampler(
         n_components=kernel_n_components)
     self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                      max_depth=5,
                                                      n_jobs=4)
     self.X_names = [
         'Desc_CounterX', 'Desc_ClusterdX', 'Desc_KmX', 'Desc_PCAX',
         'Desc_PCAClusterdX', 'Desc_RbfX', 'Desc_TreeX'
     ]
     self.linear_feature_selector = None
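A small illustration (not part of the original examples) of the min_df / max_df trade-off described in the comments above: loose thresholds keep many features, tight ones keep only mid-frequency terms. The corpus and thresholds are hypothetical:

import sklearn.feature_extraction.text as text

docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "a bird sang in the tree"]

loose = text.CountVectorizer(min_df=1).fit(docs)              # keep every term
tight = text.CountVectorizer(min_df=2, max_df=0.9).fit(docs)  # drop rare and near-ubiquitous terms
print(len(loose.vocabulary_), len(tight.vocabulary_))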
Example #9
def parseLogs(inputFile, outputFile):
    vectorizer = ext.CountVectorizer(tokenizer=get_tokens,
                                     stop_words='english')
    with open(inputFile) as file:
        lines = [line.rstrip() for line in file]
    lineNos = dict(zip(range(1, len(lines) + 1), lines))
    doc_matrix = vectorizer.fit_transform(lines)

    tf_idf_transformer = ext.TfidfTransformer().fit(doc_matrix)
    sparse = tf_idf_transformer.transform(doc_matrix).toarray()

    perLineScore = []
    for row in sparse:
        perLineScore.append(row.sum() / len(row.nonzero()[0]))

    lineScores = dict(zip(range(1, len(lines) + 1), perLineScore))

    df = pd.DataFrame([lineNos, lineScores]).T
    df.columns = ['d{}'.format(i) for i, col in enumerate(df, 1)]
    df = df.sort_values(by=['d2'], ascending=False)

    with open(outputFile, 'w') as outFile:
        for index, row in df.iterrows():
            line = "{0:0=3d}  {1}\n"
            outFile.write(line.format(index, row['d1']))
def comparison_test(text):
    import sklearn.feature_extraction.text as txt
    h_trick = txt.HashingVectorizer(n_features=20, binary=True, norm=None)
    oh_encoder = txt.CountVectorizer()
    oh_encoded = oh_encoder.fit_transform(text)
    hashing = h_trick.transform(text)
    return oh_encoded, hashing
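A minimal usage sketch for comparison_test (not part of the original example), contrasting the vocabulary-based one-hot counts with the fixed-width hashing features:

corpus = ["the cat sat on the mat",
          "the dog chased the cat",
          "birds sing in the tree"]

oh_encoded, hashing = comparison_test(corpus)
print(oh_encoded.shape)  # (3, size of the learned vocabulary)
print(hashing.shape)     # (3, 20) -- width fixed by n_features, no vocabulary stored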
Example #11
def create_disaster_sequence(disaster_csv_path, category_name):

    disaster = ut.read_csv(disaster_csv_path)

    print('Getting Data...')
    X = disaster['message'].values
    Y = disaster[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.3)

    print('Tokenizing and count vectorizing...')
    vect = st.CountVectorizer(tokenizer=lambda message: (
        pt.pipe
        | __normalize_text__
        | __tokenize_text__
        | __remove_stopwords__
        # | __stem_text__
        | __lemmatize_text__)(message))

    print('Tfidf transforming...')
    tfidf = st.TfidfTransformer()
    classifier = en.RandomForestClassifier()

    print('Fitting classifier on train...')
    x_train_counts = vect.fit_transform(x_train)
    x_train_tfidf = tfidf.fit_transform(x_train_counts)
    classifier.fit(x_train_tfidf, y_train)

    print('Running classifier on test...')
    x_test_counts = vect.transform(x_test)
    x_test_tfidf = tfidf.transform(x_test_counts)
    y_pred = classifier.predict(x_test_tfidf)

    print('Displaying results...')
    display_results(y_test, y_pred)
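A self-contained sketch (not from the original project) of what the pipe-composed tokenizer above amounts to; the helper bodies below are simplified stand-ins for the module's own __normalize_text__, __tokenize_text__, __remove_stopwords__, and __lemmatize_text__:

def __normalize_text__(msg):
    return msg.lower()                       # stand-in: lowercase only

def __tokenize_text__(msg):
    return msg.split()                       # stand-in: whitespace split

def __remove_stopwords__(tokens):
    return [t for t in tokens if t not in {"the", "a", "an"}]

def __lemmatize_text__(tokens):
    return tokens                            # stand-in: identity

def tokenize_message(message):
    # Same order as the pipe expression: normalize -> tokenize -> remove stopwords -> lemmatize
    return __lemmatize_text__(__remove_stopwords__(__tokenize_text__(__normalize_text__(message))))

print(tokenize_message("The storm flooded the streets"))  # ['storm', 'flooded', 'streets']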
def build_model(X_train, X_test, y_train, y_test):
    """Build and evaluate a model. Also returns the test-set predictions."""
    count_vect = sktext.CountVectorizer()
    tfidf_transformer = sktext.TfidfTransformer()

    X_train_counts = count_vect.fit_transform(X_train)
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_test_counts = count_vect.transform(X_test)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    X_train, X_test = feature_selection(X_train_tfidf, y_train, X_test_tfidf,
                                        'all')

    model = svm.SVC(C=10, kernel='linear')
    # model = dummy.DummyClassifier(strategy="stratified")
    model.fit(X_train, y_train)
    LOGGER.info("Model trained")

    # Test trained model:
    predicted = model.predict(X_test)
    df_out = pd.DataFrame(y_test)
    df_out['pred'] = predicted
    df_out['target'] = y_test
    df_out['match'] = df_out['pred'] == df_out['target']

    classifier = {
        "model": model,
        "counter": count_vect,
        "transformer": tfidf_transformer
    }
    return (classifier, df_out)
Example #13
    def __init__(self):
        ## NOTHING TODO
        # load emails
        x = open('emails.txt').read()
        emails = json.loads(x)

        # get previous spam emails (spam), non spam emails (not_spam), unclassified input mails (to_classify)
        spam = emails["spam"]
        not_spam = emails["not_spam"]
        to_classify = emails["to_classify"]

        # Number of emails
        n_spam = len(spam)
        n_not_spam = len(not_spam)
        n_to_classify = len(to_classify)
        ''' To ignore certain common words in English that might skew your model, we add them to the stop words 
         list below. You may want to experiment by choosing your own list of stop words, 
         but be sure to keep subject in this list at a minimum, as it appears in every email content.'''
        stop_words = text.ENGLISH_STOP_WORDS.union({'subject'})

        # Form bag of words model using words used at least 10 times
        vectorizer = text.CountVectorizer(stop_words=stop_words, min_df=10)
        X = vectorizer.fit_transform(spam + not_spam + to_classify).toarray()

        # split word counts into separate matrices
        self.X_spam, self.X_not_spam, self.X_to_classify = X[:n_spam, :], X[
            n_spam:n_spam + n_not_spam, :], X[n_spam + n_not_spam:, :]
def split_docs_by_repeated(input_seq_fname):
    """Take input doc files and find which ones are vs. are not repeated,
    as well as gathering information about the unigram language model of
    the repeated documents.

    Input:
        input_seq_fname: the path to a text file of Mallet-formatted data
        for repeats.

    Outputs:
        repeated_documents: list of integer ids of repeated documents
          *the union of these two should be range(25000)
        n_tokens: the length of each document in tokens
        doc_models: an array of word frequencies for each repeated document
        cv.vocabulary_: a map of vocabulary word to id
    """
    repeated_docs = []
    repeated_lines = []
    n_tokens = []
    repeats_mask = []
    with open(input_seq_fname) as f:
        for line in f:
            doc_id, is_repeat, doc = line.split('\t')
            n_tokens.append(len(doc.split()))
            repeats_mask.append(is_repeat == 'True')
            _, original_id, line_id = doc_id.split('-')
            if is_repeat == 'True':
                repeated_lines.append(doc)
                repeated_docs.append(int(original_id))

    cv = text.CountVectorizer()
    doc_models = cv.fit_transform(repeated_lines)
    n_tokens = np.array(n_tokens, dtype=int)
    repeats_mask = np.array(repeats_mask, dtype=bool)
    return repeated_docs, n_tokens, repeats_mask, doc_models, cv.vocabulary_
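A minimal usage sketch for split_docs_by_repeated (not part of the original example); the file name and the tab-separated "<tag>-<original_id>-<line_id>", "True/False", "text" lines are hypothetical:

with open("repeats.txt", "w") as f:
    f.write("doc-0-0\tTrue\tthe cat sat on the mat\n")
    f.write("doc-1-0\tFalse\tthe dog chased the cat\n")
    f.write("doc-2-0\tTrue\tthe cat sat on the sofa\n")

repeated, n_tokens, mask, models, vocab = split_docs_by_repeated("repeats.txt")
print(repeated)      # [0, 2]
print(n_tokens)      # [6 5 6]
print(mask)          # [ True False  True]
print(models.shape)  # (2, vocabulary size of the repeated lines)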
def BOW_vectors(classifying_docs, do_PCA, num_PCA):

    print "Beginning to train BOW..."
    start = time.clock()

    classifying_sentences = [" ".join(doc.words) for doc in classifying_docs]

    count_vectorizer = fet.CountVectorizer(
        analyzer="word",
        lowercase=True,
        binary=True,
        stop_words=None,
        tokenizer=TreebankWordTokenizer().tokenize)
    BOW_classifying = count_vectorizer.fit_transform(classifying_sentences)

    if do_PCA == True:
        u, s, _ = svds(A=csr_matrix.transpose(BOW_classifying.asfptype()),
                       k=num_PCA,
                       which="LM",
                       return_singular_vectors=True)
        classifying_vectors = projected_BOW(BOW_classifying, u, s)
    else:
        classifying_vectors = BOW_classifying

        print "BOW matrix has %d unique terms" % (BOW_classifying.shape[1])

    classifying_labels = [doc.sentiment for doc in classifying_docs]

    end = time.clock()
    print "Ending BOW training... took %f seconds" % (end - start)

    return classifying_vectors, classifying_labels
Example #16
 def __init__(self, cat_paths):
     ## an array of stopwords should be passed to TfidfVectorizer
     ## as an argument so that it works better
     rus_dict = csv_parse('freqrnc2011.csv')
     stop_pos = {
         'conj', 'anum', 'intj', 'advpro', 'spro', 'apro', 'pr', 'num',
         'part'
     }
     stopwords = [i['Lemma'] for i in rus_dict if i['PoS'] in stop_pos]
     stopwords += ['vk', 'com', 'https', 'photo']
     self.vectorizer = sktext.TfidfVectorizer(input='filename',
                                              stop_words=stopwords)
     self.alltexts_dict = dict()
     ## put the file names into a dictionary, keyed by category:
     for i in cat_paths:
         self.alltexts_dict[i.split('/')[-2]] = [
             i + j for j in os.listdir(i)
             if re.search('\w',
                          open(i +
                               j, 'r', encoding='utf-8').read()) is not None
         ]
     self.alltexts_list = []
     for k in self.alltexts_dict:
         self.alltexts_list += self.alltexts_dict[k]
     self.vectorizer.fit(self.alltexts_list)
     self.count_vectorizer = sktext.CountVectorizer(
         stop_words=stopwords, vocabulary=self.vectorizer.vocabulary_)
     self.features = self.vectorizer.get_feature_names()
     self.alltext = get_one_text(self.alltexts_list)
     self.counts_alltext = np.array(
         self.count_vectorizer.transform([self.alltext]).mean(axis=0))[0]
Example #17
def get_data():
    df = pd.read_table('SMSSpamCollection',
                       sep='\t',
                       header=None,
                       names=['label', 'sms_message'])

    df['label'] = df.label.map({'ham': 0, 'spam': 1})

    X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                        df['label'],
                                                        random_state=1,
                                                        test_size=0.1)

    count_vector = text.CountVectorizer(ngram_range=[1, 4], analyzer='char_wb')

    # Fit the training data and then return the matrix

    training_data = count_vector.fit_transform(X_train)
    testing_data = count_vector.transform(X_test)

    # New layer of tf-idf for a better model

    transformer = text.TfidfTransformer()
    training_data = transformer.fit_transform(training_data)
    testing_data = transformer.transform(testing_data)

    return (training_data, y_train), (testing_data,
                                      y_test), count_vector, transformer
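A minimal usage sketch (not part of the original example) that fits a Naive Bayes classifier on the returned tf-idf matrices; it assumes get_data is in scope and the SMSSpamCollection file is present in the working directory:

from sklearn.naive_bayes import MultinomialNB

(train_X, y_train), (test_X, y_test), vectorizer, transformer = get_data()
clf = MultinomialNB().fit(train_X, y_train)
print(clf.score(test_X, y_test))  # held-out accuracy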
Example #18
def NMF(arquivo, nt):
	vector = text.CountVectorizer(input='content', stop_words='english', min_df=1, strip_accents='unicode')
	arqArray = vector.fit_transform(arquivo).toarray()
	vocabulario = np.array(vector.get_feature_names())
	
	ntw = arqArray.shape[0]
	
	#NMF
	num_topics = nt
	num_top_words = ntw
	# Decomposition
	clf = decomposition.NMF(n_components=num_topics, random_state=1)
	doctopic = clf.fit_transform(arqArray)

	topic_words = []
	for topic in clf.components_:
		word_idx = np.argsort(topic)[::-1][0:num_top_words]
		topic_words.append([vocabulario[i] for i in word_idx])
		
	with np.errstate(invalid='ignore'):
		doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)
	
	
	#print(doctopic)

	FechaDocumento(arquivo)

	return(doctopic)
Example #19
def fromAllDocsToBow(all_docs,
                     strip_accents=u'ascii',
                     lowercase=True,
                     preprocessor=None,
                     stop_words=None,
                     token_pattern=u"[\\w']+\\w\\b",
                     analyzer=u'word',
                     max_df=1.0,
                     max_features=20000,
                     vocabulary=None,
                     binary=False,
                     ngram_range=(1, 1),
                     min_df=1,
                     normalize=True):
    """ Depuis un liste de documents, génère une matrice sparse contenant les occurences des mots.
        A chaque mot est associé un identifiant grace à une table de hashage.
    """
    vec_param = txt.CountVectorizer(strip_accents=strip_accents,
                                    lowercase=lowercase,
                                    preprocessor=preprocessor,
                                    stop_words=stop_words,
                                    token_pattern=token_pattern,
                                    analyzer=analyzer,
                                    max_df=max_df,
                                    max_features=max_features,
                                    vocabulary=vocabulary,
                                    binary=binary,
                                    ngram_range=ngram_range,
                                    min_df=min_df)
    bow = fromVectoBow(all_docs, vec_param, normalize)
    return bow, vec_param
Example #20
def replace_one_words(sentences):

    #all_new_list = train_sentences + test_sentences + validation_sentences
    all_new_list = sentences

    count_vectorizer = fet.CountVectorizer(
        analyzer="word",
        lowercase=False,
        binary=True,
        stop_words=None,
        tokenizer=TreebankWordTokenizer().tokenize)

    X = count_vectorizer.fit_transform(all_new_list)
    feature_names = count_vectorizer.get_feature_names()

    word_counts = np.array(X.sum(0))[0]
    one_indices = np.where(word_counts == 1)[0]
    one_words = [feature_names[ind] for ind in one_indices]

    for ind in range(len(one_indices)):
        one_word_column = X[:, one_indices[ind]]
        one_word_sentence_ind = one_word_column.nonzero()[0].tolist()[0]
        all_new_list[one_word_sentence_ind] = all_new_list[
            one_word_sentence_ind].replace(" " + one_words[ind] + " ",
                                           " UNKNOWN_WORD ")

    return "\n".join(all_new_list)
Example #21
def build_vocab(train_data, test_data):
  """
  Dataset Dictionary Builder

  Vectorizes the test and training datasets and builds their combined dictionary.

  :param train_data: DataFrame of the training data
  :type train_data: pd.DataFrame

  :param test_data: DataFrame of the test data
  :type test_data: pd.DataFrame

  :return: Combined vocabulary for the training and testing.
  :rtype: dict
  """
  # Token pattern is used to allow the vectorizer to include one letter words (e.g., "a", "i")
  vectorizer = sklearntext.CountVectorizer(lowercase=True, token_pattern=r"[\b|\w*#]*\w*[!+#*\w*|\b]*")
  words = set()
  for data_pd in [train_data, test_data]:
    vectorizer.fit_transform(data_pd[const.COL_TWEET])
    vocab = vectorizer.vocabulary_
    words |= set(vocab.keys())
  vocab = {}
  for idx, word in enumerate(sorted(words)):
    vocab[word] = idx
  return vocab
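A minimal usage sketch for build_vocab (not part of the original example), assuming `const.COL_TWEET` names the tweet-text column as in the surrounding module; the data frames are hypothetical:

import pandas as pd

train_df = pd.DataFrame({const.COL_TWEET: ["i love #ml", "a good day for ML"]})
test_df = pd.DataFrame({const.COL_TWEET: ["ml is fun"]})

vocab = build_vocab(train_df, test_df)
print(sorted(vocab.items())[:5])  # (word, index) pairs, indices assigned alphabetically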
Example #22
def create_disaster_pipeline(disaster_csv_path, category_name):

    disaster = ut.read_csv(disaster_csv_path)

    print('Getting data...')
    X = disaster['message'].values
    Y = disaster[category_name].values
    x_train, x_test, y_train, y_test = ms.train_test_split(X, Y, test_size=0.3)

    print('Creating pipeline...')
    pipeline = pi.Pipeline([
        ('vect',
         st.CountVectorizer(
             tokenizer=lambda text: (pt.pipe
                                     | __normalize_text__
                                     | __tokenize_text__
                                     | __remove_stopwords__
                                     | __lemmatize_text__)(text))),
        ('tfidf', st.TfidfTransformer()), ('clf', en.RandomForestClassifier())
    ])

    print('Fitting pipeline...')
    pipeline.fit(x_train, y_train)

    print('Predicting with pipeline...')
    y_pred = pipeline.predict(x_test)

    print('Displaying results...')
    display_results(y_test, y_pred)

    pass
    def _vectorizeProducts(self):
        prodvectorizer = text.CountVectorizer()
        prodvectorize = self._vectorize()

        dfmerged = pd.DataFrame()
        dfmerged["productid"] = self.df["productid"]
        dfmerged["title"] = self.df["title"]
        dfmerged["info"] = pd.concat([
            self.df["title"].astype(str) + self.df["subtitle"].astype(str) +
            self.df["bullets"].astype(str)
        ],
                                     axis=1,
                                     join='inner')
        self.overall_prod_matrix = prodvectorize.fit_transform(
            dfmerged["info"])
        dfmerged["count"] = range(0, len(dfmerged))
        dfmerged["svd"] = self.overall_prod_matrix
        dfmerged["coordinate"] = dfmerged["count"].apply(
            lambda index: self.overall_prod_matrix[index, ])

        dfmerged.drop("count", axis=1)
        dfmerged.to_csv("output.csv")

        self.overall_prod_matrix = self._reducedimensionality(
            self.overall_prod_matrix, features=self.overall_weight)

        return dfmerged
Example #24
    def extractTopic(self):
        """	* Tokenize the all words
			* Eliminates any word with less than two letters
			* Forms term frequency–inverse document frequency for each word
			* Generate Document-term_matrix
			* Classify topics based on Document-term_matrix and frequency–inverse document frequency 
			* Gathers first "N" numbers from each topic 
		"""

        self.vectorizer = text.CountVectorizer(input='filename',
                                               stop_words='english',
                                               min_df=2)

        for x in range(len(self.fileNames)):
            temp = self.fileNames[x]
            self.fileNames[x] = self.baseDirectory + temp

        self.dtm = self.vectorizer.fit_transform(self.fileNames).toarray()
        self.vocab = np.array(self.vectorizer.get_feature_names())
        self.clf = decomposition.NMF(n_components=self.num_topics,
                                     random_state=1)

        self.doctopic = self.clf.fit_transform(self.dtm)

        for topic in self.clf.components_:
            word_idx = np.argsort(topic)[::-1][0:self.num_top_words]
            self.topic_words.append([self.vocab[i] for i in word_idx])

        for t in range(len(self.topic_words)):
            print("Topic {}: {}".format(t, ' '.join(self.topic_words[t][:15])))
Example #25
def count_vectorizer(df, col_name, vocab=None):
    """
  String Vectorizer With Optional Dictionary Support

  Given a Pandas DataFrame, this function will tokenizer using either the provided dictionary
  or the one implicit to the data itself.  It then returns a matrix of the tf-idf
  scores of each of the words.

  :param df: Source data frame to vectorize
  :type df: pd.DataFrame
  :param col_name: Name of the feature column in the pandas DataFrame
  :type col_name: string
  :param vocab: Dictionary of support dictionary words to mapping of index number
  :type vocab: dict
  :return: TF-IDF word matrix and vocabulary.
  :rtype: Tuple(pd.DataFrame, dict)
  """
    stop_words = nltk.corpus.stopwords.words('english')
    vectorizer = sklearntext.CountVectorizer(lowercase=True,
                                             stop_words=stop_words,
                                             vocabulary=vocab)
    doc_word_matrix = vectorizer.fit_transform(df[col_name])
    if vocab is None:
        vocab = vectorizer.vocabulary_

    tf_idf = sklearntext.TfidfTransformer(
        norm=None).fit_transform(doc_word_matrix)
    return tf_idf.toarray(), vocab
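A minimal usage sketch for this count_vectorizer (not part of the original example); it needs the NLTK English stopword corpus to be available, and the DataFrame below is hypothetical:

import pandas as pd

df = pd.DataFrame({"tweet": ["the cat sat on the mat",
                             "dogs bark at cats all day"]})

# First call learns the vocabulary from the column itself.
tfidf, vocab = count_vectorizer(df, "tweet")
print(tfidf.shape, len(vocab))

# A later call can reuse the learned vocabulary, e.g. on a test frame.
tfidf_test, _ = count_vectorizer(df, "tweet", vocab=vocab)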
Example #26
    def find_NMF_topics(self):
        """
        :param num_topics:
        :param num_top_words: a list of top words for each topic
        :return:
        """

        vectorizer = text.CountVectorizer(input='filename',
                                          stop_words='english',
                                          min_df=self.min_df,
                                          max_df=self.max_df)
        dtm = vectorizer.fit_transform(self.all_documents).toarray()

        vocab = np.array(vectorizer.get_feature_names())
        clf = decomposition.NMF(n_components=self.num_topics, random_state=1)

        # doctopic gives, for each document, how strongly it is associated with each topic
        self.doctopic = clf.fit_transform(dtm)

        self.topic_words = []
        for topic in clf.components_:
            word_idx = np.argsort(topic)[::-1][0:self.num_top_words]
            self.topic_words.append([vocab[i] for i in word_idx])

        return
Example #27
def train():
    """
    Builds the SVM based on training data.
    """
    features, labels = __init__.load_data('train')

    vectorizer = text.CountVectorizer(decode_error='ignore',
                                      stop_words='english')
    transformer = text.TfidfTransformer()

    classifier = linear_model.SGDClassifier(loss='hinge',
                                            penalty='l2',
                                            alpha=1e-3,
                                            tol=1e-3,
                                            random_state=42)

    # Serializes the processing steps that would be required of the above.
    text_clf = pipeline.Pipeline(
        steps=[('vect', vectorizer), ('tfidf',
                                      transformer), ('clf-sgdc', classifier)])

    start = time.time()
    text_clf.fit(features, labels)
    print 'Training time:\t%1.4f seconds' % (time.time() - start)

    __init__.evaluate(text_clf, features, labels)

    return text_clf
Example #28
def count_vectorizer(docs, ngram_range=(1, 1), max_features=None, **kwargs):
    """
    Calculate counts of n-grams for a given collection (list) of strings.

    Args:
        docs(List): a sequence of strings
        ngram_range(Tuple): (min_n, max_n) to define the lower and upper boundary of the range of
            n-values for different n-grams to be extracted. All values of n such
            that min_n <= n <= max_n will be used.
        max_features(int or None): If not None, build a vocabulary that only consider the top
            max_features ordered by term frequency across the corpus.

    Returns:
        term frequency (count) vectorizer
            Vectorizer is trained on the input `docs`. Can be passed to modeling functions
            or to :func:`term_doc_matrix_to_pandas` to get a Pandas DataFrame.  See the
            `scikit-learn CountVectorizer documentation
            <http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
            for attributes of the vectorizer.

    """
    docs = preprocessing._u(docs)

    vectorizer = sktext.CountVectorizer(ngram_range=ngram_range,
                                        max_features=max_features,
                                        **kwargs)
    vectorizer.fit(docs)

    return vectorizer
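A minimal usage sketch for this count_vectorizer (not part of the original example), assuming the surrounding module's `preprocessing._u` helper is importable; the documents are hypothetical:

docs = ["the cat sat on the mat",
        "the dog chased the cat across the yard"]

vec = count_vectorizer(docs, ngram_range=(1, 2), max_features=50)
X = vec.transform(docs)              # term-document count matrix
print(X.shape)
print(vec.get_feature_names()[:5])   # first few unigram/bigram features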
Example #29
    def __init__(self):
        # load Documents
        x = open('fedpapers_split.txt').read()
        papers = json.loads(x)

        # split Documents
        papersH = papers[0]  # papers by Hamilton
        papersM = papers[1]  # papers by Madison
        papersD = papers[2]  # disputed papers

        # Number of Documents for H, M and D
        nH = len(papersH)
        nM = len(papersM)
        nD = len(papersD)
        '''To ignore certain common words in English that might skew your model, we add them to the stop words 
        list below. You may want to experiment by choosing your own list of stop words, but be sure to keep 
        'HAMILTON' and 'MADISON' in this list at a minimum, as their names appear in the text of the papers 
        and leaving them in could lead to unpredictable results '''

        stop_words = text.ENGLISH_STOP_WORDS.union({'HAMILTON', 'MADISON'})
        #stop_words = {'HAMILTON','MADISON'}
        # Form bag of words model using words used at least 10 times
        vectorizer = text.CountVectorizer(stop_words=stop_words, min_df=10)
        X = vectorizer.fit_transform(papersH + papersM + papersD).toarray()
        '''To visualize the full list of words remaining after filtering out stop words and words used less 
        than min_df times uncomment the following line'''
        #print(vectorizer.vocabulary_)

        # split word counts into separate matrices
        self.XH, self.XM, self.XD = X[:nH, :], X[nH:nH + nM, :], X[nH + nM:, :]
Example #30
def build_nontext_vector(fin, colname, colidx, normalize):
    """
  Handles the specified column as a categorical variable.
  """
    print "Building category vector for %s" % (colname)
    fout = str.replace(fin, ".csv", "." + colname + ".mtx")
    if os.path.isfile(fout):
        return
    ftmp = str.replace(fin, ".csv", ".tmp")
    reader = csv.reader(open(fin, 'rb'))
    tmpwriter = open(ftmp, 'wb')
    ln = 0
    for row in reader:
        ln += 1
        if ln <= 1:
            continue
        if ln % 1000 == 0:
            print "...(processed %d lines)" % (ln)
        colval = str.lower(row[colidx])
        if normalize:
            colval = str.replace(colval, " ", "_")
        if len(colval.rstrip()) == 0:
            colval = "UNK"
        tmpwriter.write(colval + "\n")
    tmpwriter.close()
    tmpreader = open(ftmp, 'rb')
    vectorizer = sft.CountVectorizer(max_features=100)
    catmatrix = vectorizer.fit_transform(tmpreader)
    os.remove(ftmp)
    writer = open(fout, 'wb')
    sio.mmwrite(writer, catmatrix)
    writer.close()