def testWithPhoto(): corpus_all = buildAllCorpus(element_type='photos', debug=True) for key, corpus in corpus_all.items(): break ei = EventInterface() ei.setDB('citybeat') ei.setCollection('candidate_event_25by25_merged') event = ei.getDocument() event = BaseFeatureProduction(event, corpus=corpus) print event.extractFeatures()
def generateData2():
    """Emit ARFF training data for the unbalanced true/false event sets.

    Prints the ARFF header once, then one feature row per event, looking
    up each event's corpus by its region key.
    """
    all_corpus = buildAllCorpus(time_interval_length=14, debug=True)
    true_event_list, false_event_list = loadUnbalancedData()
    BaseFeatureProduction.GenerateArffFileHeader()
    for event in true_event_list + false_event_list:
        region = Region(event['region'])
        event_corpus = all_corpus[region.getKey()]
        BaseFeatureProduction(event, event_corpus, None).printFeatures()
def testWithTweet(): cnt = 0 corpus_all = buildAllCorpus(element_type='tweets', debug=False) ei = EventInterface() ei.setDB('citybeat_experiment') ei.setCollection('twitter_candidate_events') cur = ei.getAllDocuments() print TwitterFeature.GenerateArffFileHeader() for event in cur: region = Region(event['region']) event = TwitterFeature(event, corpus=corpus_all[region.getKey()]) if event.getActualValue() < 8: print '< 8' continue cnt += 1 print event.extractFeatures() print cnt, cur.count()
def testWithTweet(): from corpus import buildAllCorpus corpus_all = buildAllCorpus(element_type="tweets", debug=True) for key, corpus in corpus_all.items(): break ei = EventInterface() ei.setDB("citybeat") ei.setCollection("candidate_event_25by25_merged") event = ei.getDocument() print event ti = TweetInterface() cur = ti.getAllDocuments(limit=30) tweets = [] for tweet in cur: tweets.append(tweet) del event["photos"] event["tweets"] = tweets event = BaseFeature(event, corpus=corpus) print event.printFeatures()
def __init__(self, element_type):
    """Build char-ngram tf-idf corpora for the given element type.

    The owning class, given an event, returns a list of indices of the
    items in its 'photos'/'tweets' field that are representative of the
    cluster.  The TfidfVectorizer parameters below could be customized;
    see
    http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

    element_type -- either "tweets" or "photos".
    """
    assert element_type in ["tweets", "photos"]
    self._element_type = element_type
    # Word-boundary-aware character 4-gram tf-idf configuration.
    paras = {
        "max_df": 0.05,
        "min_df": 1,
        "strip_accents": "ascii",
        "smooth_idf": True,
        "preprocessor": self._preProcessor,
        "sublinear_tf": True,
        "norm": "l2",
        "analyzer": "char_wb",
        "ngram_range": (4, 4),
        "stop_words": "english",
    }
    self._corpus_dicts_char = buildAllCorpus(element_type=self._element_type, paras=paras)