# ---- 示例 #1 (Example #1) ----
# 0
            print t, c, mu, sigma, (mu/sigma)



# Build the collocation statistics over the corpus, then query collocations
# for the seed term 'trendy'; min_count=1 keeps even pairs seen only once.
# NOTE(review): train() and collocation() are defined earlier in the file
# (above this chunk) — confirm their signatures there.
train()
collocation('trendy', min_count=1)


#---------Use NLTK collocations--------------
import nltk
import re
import HTMLParser

import datastores.datastore as d

# Load the crawled RSS-feed documents from Solr into a data frame.
df = d.solr_data_frame('Beauty_Crawl_RSS_Feeds')

# Unescape HTML entities and re-encode each document as UTF-8, in place.
# NOTE(review): `documents` is expected to be defined earlier in the file.
h = HTMLParser.HTMLParser()
# FIX: the original loop used `for i,d in enumerate(documents)`, which
# clobbered the `datastores.datastore as d` module alias imported above;
# use a distinct loop variable instead.
for i, doc in enumerate(documents):
    documents[i] = h.unescape(doc).encode('utf-8')


def tokenize(documents):
    """Yield lowercase word tokens from every sentence of every document.

    Args:
        documents: sequence of text documents (str/unicode).

    Yields:
        Word tokens, lowercased, in document order (sentence-split first,
        then word-split, both via NLTK).
    """
    total = len(documents)  # hoist loop-invariant length out of the loop
    for i, doc in enumerate(documents):
        # Lightweight progress indicator every 100 documents.
        # Parenthesized print works under both Python 2 and Python 3.
        if i % 100 == 0:
            print('%d of %d' % (i, total))
        for sent in nltk.sent_tokenize(doc.lower()):
            for word in nltk.word_tokenize(sent):
                yield word

# ---- 示例 #2 (Example #2) ----
# 0
__author__ = 'sriWork'
import datastores.datastore as ds

# Solr collection and the document fields we pull from it.
COLLECTION = 'Health_Crawl_RSS_Feeds'
# FIX: the original list was missing a comma between 'content' and
# 'pubDate_dt', so implicit string-literal concatenation merged them into
# the single bogus field name 'content,pubDate_dt' (dropping both fields).
FIELDS = ['id', 'title', 'content', 'pubDate_dt', 'tags_s', 'lang', 'author']
QUERY = None    # no filter query: fetch every document in the collection
CACHE = False   # re-read from Solr rather than any local cache

#####  Read solr data into 'dataframe' #####
dataframe = ds.solr_data_frame(COLLECTION, FIELDS, QUERY, CACHE)
##print(dataframe['content'][373])
length_dataframe = len(dataframe)

#### Count the number of english and spanish documents and print the other language tags

# Tally documents whose Solr 'lang' field marks them as English or Spanish.
cnt_eng = 0
cnt_es = 0
for i in range(length_dataframe):
    doc_lang = dataframe['lang'][i]  # hoist the per-row lookup once
    if doc_lang == [u'en']:
        cnt_eng += 1
    elif doc_lang == [u'es']:
        cnt_es += 1