Example #1
import json
import os

import pandas as pd

import util


def get_base_data(config):
    df = pd.read_csv(os.path.join(config["clean"]["baseDir"],
                                  config["vectorize"]["cleanHash"],
                                  "result.csv"),
                     low_memory=False)
    udf = pd.read_csv(os.path.join(config["clean"]["baseDir"],
                                   config["vectorize"]["cleanHash"],
                                   "useable.csv"),
                      low_memory=False)
    with open(os.path.join(config["vectorize"]["outputDir"], "info.json"),
              "r") as f:
        info = json.load(f)
    stopWords = util.getStopWords(config)

    return [{
        "all": df.id.count(),
        "annot": df[~df.notAnnot].id.count(),
        "payloadMinLength": config["clean"]["payloadMinLength"],
        # combine both masks in one boolean expression instead of chained
        # indexing, which pandas has to reindex and warns about
        "duplicates": df[~df.notAnnot & df.duplicate].id.count(),
        "useable": udf[udf.useable].id.count(),
        "labelsets": udf.labels.nunique(),
        # number of label sets that occur exactly once
        "labelsetsOnce":
        udf.groupby(udf.labels).labels.count().value_counts().get(1),
        "labelCardinality": sum(udf.nol) / len(udf),
        "labelDensity":
        sum(udf.nol) / (len(udf) * len(util.getLabels(config)[1:])),
        "special": udf[udf.special].id.count(),
        "allFeatures": info["allFeatures_bow"],
        "noTrain": info["noTrain"],
        "noTest": info["noTest"],
        "noTrain_train": info["noTrain_train"],
        "noTrain_val": info["noTrain_val"],
        "noStopWords": len(stopWords),
        "wc_mean": udf.wc.mean(),
        "wc_median": udf.wc.median(),
        "wc_first_quartile": udf.wc.quantile(.25),
    }]
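
For orientation, a minimal, hypothetical usage sketch. The directory names, hash value, and config values below are invented for illustration (only the keys that get_base_data actually reads are populated), and the call assumes the corresponding result.csv, useable.csv, and info.json files already exist on disk alongside the project's util module.

# Hypothetical config: paths and hash are placeholders, not the project's
# real layout; the referenced CSV/JSON files must already exist.
config = {
    "clean": {"baseDir": "data/clean", "payloadMinLength": 20},
    "vectorize": {"cleanHash": "abc123", "outputDir": "data/vectorized"},
}

stats = get_base_data(config)[0]
print("useable: %d" % stats["useable"])
print("label cardinality: %.3f" % stats["labelCardinality"])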
Example #2
import util


def unigram_noStop(md):
    """
    arguments:
      md is a util.MovieData object
    returns:
      a dictionary containing a mapping from unigram features from the reviews
      to their values on this util.MovieData object, with stop words removed
    """
    # unigram_feats (defined alongside this function) maps each unigram in
    # the reviews to its count
    unigramCount = unigram_feats(md)
    for sword in util.getStopWords():
        # pop with a default is a safe no-op for stop words that never occur
        # in the reviews; plain-dict del would raise KeyError for them
        unigramCount.pop(sword, None)

    return unigramCount
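
unigram_noStop relies on pop with a default value: for a plain dict, del of a missing key raises KeyError, while pop(key, None) is a safe no-op (collections.Counter additionally makes del of a missing key a no-op, but the docstring only promises a dictionary). A standalone illustration:

counts = {"the": 2, "movie": 1, "good": 1}
counts.pop("zebra", None)  # safe no-op: "zebra" is not a key
# del counts["zebra"]      # would raise KeyError on a plain dict
counts.pop("the", None)    # removes the "the" entry
print(counts)              # {'movie': 1, 'good': 1}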
Example #3
from collections import Counter

import util


def bigram_feats_noStop(md):
    c = Counter()
    # fetch the stop word list once instead of once per reviewer
    stopWords = util.getStopWords()
    for rev in util.MovieData.reviewers:
        if hasattr(md, rev):
            # count occurrences of bigrams built from asciified, lowercase,
            # non-numeric words after removing punctuation and stop words
            wordList = util.punct_patt.sub(
                "", util.asciify(md.__dict__[rev].strip().lower())).split()
            wordList = [x for x in wordList
                        if util.non_numeric(x) and util.notStopWord(x, stopWords)]
            # pair each word with its successor; Counter.update consumes
            # the iterator of pairs directly
            c.update(zip(wordList, wordList[1:]))
    return c
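
The zip(wordList, wordList[1:]) idiom pairs each word with its successor to form bigrams, and Counter.update counts the pairs directly. A quick standalone check, with no dependency on the util module:

from collections import Counter

words = ["a", "quick", "brown", "fox"]
c = Counter()
c.update(zip(words, words[1:]))
print(c)  # Counter({('a', 'quick'): 1, ('quick', 'brown'): 1, ('brown', 'fox'): 1})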
Example #4
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import codecs
import util
import dataImporter
tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = util.getStopWords('stopwords.txt')

userinput = raw_input('Enter your list: ')
# split the comma-separated input into individual channel names
channel_list = [name.strip() for name in userinput.split(',')]
for channel in channel_list:
    dataImporter.writeChannelMessagesToFile(channel + '.txt', channel)

doc_set = []
for channel in channel_list:
    # read each channel's saved messages back in as one document
    doc = codecs.open(channel + '.txt', encoding='utf-8').read()
    doc_set.append(doc)


# list for tokenized documents in loop
texts = []

# loop through document list
for doc in doc_set: