def testWithPhoto():
    """Smoke-test BaseFeatureProduction on a single photo event.

    Builds the photo corpora with buildAllCorpus, takes the first corpus that
    the result yields, fetches a document through EventInterface.getDocument()
    from the 'candidate_event_25by25_merged' collection of the 'citybeat'
    database, and prints what extractFeatures() returns for it.

    Raises:
        ValueError: if buildAllCorpus yields no corpora at all.
    """
    corpus_all = buildAllCorpus(element_type='photos', debug=True)
    # Any single corpus is enough for this check, so stop at the first entry.
    # The ``else`` branch runs only when the loop never hits ``break``, i.e.
    # when there is nothing to iterate; failing here is clearer than the
    # NameError on ``corpus`` that would otherwise surface further down.
    for _key, corpus in corpus_all.items():
        break
    else:
        raise ValueError('buildAllCorpus returned no photo corpora')

    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')
    event = ei.getDocument()
    feature = BaseFeatureProduction(event, corpus=corpus)
    # A parenthesised single-argument print gives the same output under the
    # Python 2 print statement and the Python 3 print() function.
    print(feature.extractFeatures())
예제 #2
0
def generateData2():
    """Emit feature rows for every event returned by loadUnbalancedData().

    Corpora are built with a time interval length of 14 in debug mode.
    BaseFeatureProduction.GenerateArffFileHeader() is invoked once up front,
    then each event (true events first, false events after) is paired with the
    corpus stored under its region's key and its features are printed via
    printFeatures().
    """
    corpora_by_region = buildAllCorpus(time_interval_length=14, debug=True)
    true_event_list, false_event_list = loadUnbalancedData()
    BaseFeatureProduction.GenerateArffFileHeader()

    labelled_events = true_event_list + false_event_list
    for event in labelled_events:
        region_key = Region(event['region']).getKey()
        region_corpus = corpora_by_region[region_key]
        producer = BaseFeatureProduction(event, region_corpus, None)
        producer.printFeatures()
예제 #3
0
def testWithTweet():
    """Print TwitterFeature output for the stored Twitter candidate events.

    Reads every document from the 'twitter_candidate_events' collection of the
    'citybeat_experiment' database. The value returned by
    TwitterFeature.GenerateArffFileHeader() is printed first. Each event is
    then wrapped in a TwitterFeature using the corpus of its region; events
    whose getActualValue() is below 8 only produce the marker line '< 8',
    the others have their extractFeatures() result printed. The final line
    shows how many events were kept next to the cursor's count().
    """
    corpora_by_region = buildAllCorpus(element_type='tweets', debug=False)

    interface = EventInterface()
    interface.setDB('citybeat_experiment')
    interface.setCollection('twitter_candidate_events')
    cursor = interface.getAllDocuments()

    print(TwitterFeature.GenerateArffFileHeader())

    kept = 0
    for raw_event in cursor:
        region_key = Region(raw_event['region']).getKey()
        feature = TwitterFeature(raw_event, corpus=corpora_by_region[region_key])
        if feature.getActualValue() < 8:
            print('< 8')
        else:
            kept += 1
            print(feature.extractFeatures())

    # Formatting both numbers into one string keeps the "kept total" output
    # of the original two-item print.
    print('%s %s' % (kept, cursor.count()))
예제 #4
0
def testWithTweet():
    """Smoke-test BaseFeature on an event whose photos are swapped for tweets.

    Builds the tweet corpora and takes the first corpus the result yields,
    fetches a document through EventInterface.getDocument() from the
    'candidate_event_25by25_merged' collection of the 'citybeat' database and
    prints it, then replaces its 'photos' entry with up to 30 documents read
    through TweetInterface and runs BaseFeature.printFeatures() on the result.

    Raises:
        ValueError: if buildAllCorpus yields no corpora at all.
    """
    from corpus import buildAllCorpus

    corpus_all = buildAllCorpus(element_type="tweets", debug=True)
    # Any single corpus is enough for this check, so stop at the first entry.
    # The ``else`` branch runs only when the loop never hits ``break``, i.e.
    # when there is nothing to iterate; failing here is clearer than the
    # NameError on ``corpus`` that would otherwise surface further down.
    for _key, corpus in corpus_all.items():
        break
    else:
        raise ValueError("buildAllCorpus returned no tweet corpora")

    ei = EventInterface()
    ei.setDB("citybeat")
    ei.setCollection("candidate_event_25by25_merged")
    event = ei.getDocument()
    print(event)

    ti = TweetInterface()
    # Materialise the cursor so the event carries a plain list of tweets.
    tweets = list(ti.getAllDocuments(limit=30))

    # Turn the photo event into a tweet event in place.
    del event["photos"]
    event["tweets"] = tweets

    feature = BaseFeature(event, corpus=corpus)
    # NOTE(review): printFeatures() presumably writes its own output, in which
    # case this extra print only adds the return value (likely "None").
    # Kept as in the original -- confirm before removing.
    print(feature.printFeatures())
예제 #5
0
    def __init__(self, element_type):
        """Prepare the character n-gram corpora for the given element type.

        Per the original author's note, the enclosing object is meant to take
        an event and return the indices of the elements in its 'photos' field
        that are representative of, and can stand for, the cluster.

        The settings collected in ``paras`` use the option names of
        scikit-learn's TfidfVectorizer, see
        http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
        and are handed to buildAllCorpus together with the element type.

        Args:
            element_type: either "tweets" or "photos"; any other value trips
                the assertion below.
        """

        assert element_type in ("tweets", "photos")
        self._element_type = element_type

        # NOTE(review): scikit-learn documents ``stop_words`` as applying only
        # when analyzer == 'word'; with "char_wb" it is presumably ignored.
        # Confirm whether that is intended.
        paras = {
            "max_df": 0.05,
            "min_df": 1,
            "strip_accents": "ascii",
            "smooth_idf": True,
            "preprocessor": self._preProcessor,
            "sublinear_tf": True,
            "norm": "l2",
            "analyzer": "char_wb",
            "ngram_range": (4, 4),
            "stop_words": "english",
        }
        self._corpus_dicts_char = buildAllCorpus(
            element_type=self._element_type,
            paras=paras,
        )