Example No. 1
def train_tsne(training_size=2000,
               metric='cosine',
               n_components=3,
               perplexity=100,
               angle=.12):
    # adjust this downward to see if it affects accuracy
    import numpy as np  # replaces `np = pd.np`, which newer pandas no longer provides

    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]  # keep only tweets with a labeled isbot score
    gc.collect()  # reclaim the RAM freed by dropping the unlabeled tweets

    # labels3 = tweets.isbot.apply(lambda x: int(x * 3))
    labels = tweets.isbot.apply(lambda x: int(x * 2))  # quantize the fuzzy isbot score into integer class labels

    lsa = LsiModel.load(
        os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])
    # tfidfs = tfidf[bows]

    # project the TF-IDF weighted BOWs into the 200-topic LSA space, one row per tweet
    X = pd.DataFrame(
        [pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
        index=tweets.index)

    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    # >>> sum(~mask)
    # 99
    # >>> tweets.loc[mask.argmin()]
    # isbot                 0.17
    # strict                  13
    # user      b'CrisParanoid:'
    # text         b'#sad again'
    # Name: 571, dtype: object

    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]

    # training_size < 1 is interpreted as a training fraction; otherwise as an absolute number of training rows
    test_size = 1.0 - training_size if training_size < 1 else float(len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest = X.loc[Xindex], X.loc[Xindex_test]
    y, ytest = y.loc[yindex], y.loc[yindex_test]

    # labels_test = labels.loc[yindex_test]
    labels = labels.loc[yindex]

    tsne = TSNE(metric='precomputed',
                n_components=n_components,
                angle=angle,
                perplexity=perplexity)
    # fit on a precomputed matrix of pairwise distances in the chosen metric
    tsne = tsne.fit(positive_distances(X.values, metric=metric))

    return tsne, X, Xtest, y, ytest
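
A minimal usage sketch for train_tsne, assuming the module-level helpers it relies on (read_csv, BIGDATA_PATH, positive_distances, pd, tqdm, LsiModel, TfidfModel, TSNE, train_test_split) are importable alongside it and the pickled LSA model and tweets.csv.gz exist under BIGDATA_PATH; the import line is hypothetical:

# hypothetical import -- adjust to wherever train_tsne is defined
# from nlpia.models import train_tsne

tsne, X, Xtest, y, ytest = train_tsne(training_size=2000, metric='cosine',
                                      n_components=3, perplexity=100, angle=.12)
print(tsne.embedding_.shape)  # (n_training_rows, n_components) t-SNE coordinates for the training tweets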
Example No. 2
def lsa_twitter(cased_tokens):
    """ Latent Sentiment Analyis on random sampling of twitter search results for words listed in cased_tokens """

    # Only 5 of these tokens are saved for a no_below=2 filter:
    #   PyCons NLPS #PyCon2016 #NaturalLanguageProcessing #naturallanguageprocessing
    if cased_tokens is None:
        cased_tokens = ('PyConOpenSpaces PyCon PyCon2017 PyCon2018 PyCon2016 PyCon2015 OpenSpace PyconTutorial ' +
                        'NLP NaturalLanguageProcessing NLPInAction NaturalLanguageProcessingInAction NLPIA Twote Twip'
                        ).split()
        cased_tokens += [s + 's' for s in cased_tokens]

        cased_tokens += 'TotalGood TotalGoods HobsonLane Hob Hobs TotalGood.com ' \
                        'www.TotalGood.com http://www.TotalGood.com https://www.TotalGood.com'.split()

    allcase_tokens = cased_tokens + [s.lower() for s in cased_tokens]
    allcase_tokens += [s.title() for s in cased_tokens]
    allcase_tokens += [s.upper() for s in cased_tokens]
    KEEP_TOKENS = allcase_tokens + ['#' + s for s in allcase_tokens]

    # the tokenized tweets are needed below for the BOW conversion even when a cached vocab exists
    tweets_path = os.path.join(BIGDATA_PATH, 'tweets.csv.gz')
    print('Loading tweets: {} ...'.format(tweets_path))
    tweets = read_csv(tweets_path)
    tweets = np.array(tweets.text.str.split())

    # takes 15 minutes and 10GB of RAM for 500k tweets if you keep all 20M unique tokens/names/URLs
    vocab_path = os.path.join(BIGDATA_PATH, 'vocab939370.pkl')
    if os.path.isfile(vocab_path):
        print('Loading vocab: {} ...'.format(vocab_path))
        vocab = Dictionary.load(vocab_path)
        print(' len(vocab) loaded: {}'.format(len(vocab.dfs)))
    else:
        with gzip.open(os.path.join(BIGDATA_PATH, 'tweets.txt.gz'), 'w') as f:
            for tokens in tweets:
                f.write((' '.join(tokens) + '\n').encode('utf-8'))
        # tweets['text'] = tweets.text.apply(lambda s: eval(s).decode('utf-8'))
        # tweets['user'] = tweets.user.apply(lambda s: eval(s).decode('utf-8'))
        # tweets.to_csv('tweets.csv.gz', compression='gzip')
        print('Computing vocab from {} tweets...'.format(len(tweets)))
        vocab = Dictionary(tweets, no_below=NO_BELOW, no_above=NO_ABOVE, keep_tokens=set(KEEP_TOKENS))

    vocab.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N, keep_tokens=set(KEEP_TOKENS))
    print(' len(vocab) after filtering: {}'.format(len(vocab.dfs)))

    # takes no time at all, just a bookkeeping step; it doesn't actually compute anything
    tfidf = TfidfModel(id2word=vocab, dictionary=vocab)
    tfidf.save(os.path.join(BIGDATA_PATH, 'tfidf{}.pkl'.format(len(vocab.dfs))))

    tweets = [vocab.doc2bow(tw) for tw in tweets]  # convert each tokenized tweet into a bag-of-words vector
    json.dump(tweets, gzip.open(os.path.join(BIGDATA_PATH, 'tweet_bows.json.gz'), 'wt'))  # text mode ('wt') so json.dump can write str

    gc.collect()

    # LSA (Latent Semantic Analysis) is a more useful name than LSI (Latent Semantic Indexing) for this model
    lsa = LsiModel(tfidf[tweets], num_topics=200, id2word=vocab, extra_samples=100, power_iters=2)

    return lsa
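
A usage sketch for lsa_twitter, assuming the NO_BELOW, NO_ABOVE, and KEEP_N constants and the tweet files under BIGDATA_PATH are in place; passing None uses the PyCon/NLPIA seed tokens defined above, and the save filename is hypothetical:

lsa = lsa_twitter(cased_tokens=None)
# inspect a few of the 200 learned LSA topics
for topic_id, topic in lsa.show_topics(num_topics=3, num_words=5, formatted=True):
    print(topic_id, topic)
lsa.save(os.path.join(BIGDATA_PATH, 'lsa_tweets.pkl'))  # hypothetical filename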
Example No. 3
def clean_df(df, header=None, **read_csv_kwargs):
    """ Convert UTF-8 characters in a CSV file or dataframe into ASCII

    Args:
      df (DataFrame or str): DataFrame or path or url to CSV

    Returns:
      DataFrame: the cleaned frame, with NaNs replaced by ' ' and every column passed through unicode2ascii
    """
    df = read_csv(df, header=header, **read_csv_kwargs)
    df = df.fillna(' ')
    for col in df.columns:
        df[col] = df[col].apply(unicode2ascii)
    return df
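
A usage sketch for clean_df; the CSV filenames are hypothetical, and unicode2ascii is assumed to be the surrounding module's transliteration helper:

ascii_df = clean_df('tweets_with_unicode.csv', header=0)  # also accepts a URL or an existing DataFrame
ascii_df.to_csv('tweets_ascii.csv', index=False)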
Example No. 4
def train_lda(training_size=2000, metric='cosine'):
    """ Train a linear discriminant analysis model on LSA topic vectors to predict the isbot label (metric is currently unused) """
    tweets = read_csv(os.path.join(BIGDATA_PATH, 'tweets.csv.gz'))
    tweets = tweets[tweets.isbot >= 0]

    # labels3 = tweets.isbot.apply(lambda x: int(x * 3))
    labels = tweets.isbot.apply(lambda x: int(x * 2))

    lsa = LsiModel.load(
        os.path.join(BIGDATA_PATH, 'lsa_tweets_5589798_2003588x200.pkl'))
    tfidf = TfidfModel(id2word=lsa.id2word, dictionary=lsa.id2word)
    bows = np.array([lsa.id2word.doc2bow(txt.split()) for txt in tweets.text])
    # tfidfs = tfidf[bows]

    X = pd.DataFrame(
        [pd.Series(dict(v)) for v in tqdm(lsa[tfidf[bows]], total=len(bows))],
        index=tweets.index)
    mask = ~X.isnull().any(axis=1)
    mask.index = tweets.index
    X = X[mask]
    y = tweets.isbot[mask]
    labels = labels[mask]
    # labels3 = labels3[mask]

    test_size = 1.0 - training_size if training_size < 1 else float(len(X) - training_size) / len(X)
    Xindex, Xindex_test, yindex, yindex_test = train_test_split(
        X.index.values, y.index.values, test_size=test_size)
    X, Xtest = X.loc[Xindex], X.loc[Xindex_test]
    y, ytest = y.loc[yindex], y.loc[yindex_test]
    labels_test = labels.loc[yindex_test]
    labels = labels.loc[yindex]

    # LDA here is presumably sklearn's LinearDiscriminantAnalysis: solver='lsqr', shrinkage='auto'
    lda = LDA('lsqr', 'auto', n_components=3)
    print(cross_val_score(lda, Xtest, labels_test, cv=7))  # 7-fold cross-validation scores on the held-out set

    lda = LDA('lsqr', 'auto', n_components=3)
    lda = lda.fit(X.values, labels.values)
    y_lda = lda.predict(Xtest)
    print(mean_squared_error(y_lda, ytest))  # MSE between the predicted integer labels and the fuzzy isbot score

    df_test = pd.DataFrame(lda.predict(Xtest),
                           index=Xtest.index,
                           columns=['predict'])
    df_test['truth'] = labels_test
    return lda, df_test
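
A usage sketch for train_lda; the function prints the 7-fold cross-validation scores and the test-set MSE itself, so the caller only needs the fitted model and the prediction table it returns:

lda, df_test = train_lda(training_size=2000)
accuracy = (df_test['predict'] == df_test['truth']).mean()  # fraction of held-out tweets classified correctly
print('test accuracy: {:.3f}'.format(accuracy))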