Example #1
    def train(self, train_path):
        """ Train classifier on features from headline and article text """
        if self.debug:
            tick = time()
            logging.info("Training new model with %s" % (train_path,))
            logging.info("Loading/shuffling training data...")

        train_data_1 = Datasheet.load(train_path)

        shuffle(train_data_1)
        train_texts_1 = zip(train_data_1.columns[0], train_data_1.columns[1])
        train_labels_1 = [0 if x == '0' else 1 for x in train_data_1.columns[-1]]
        if self.debug:
            logging.info('Fitting training data')
        pipeline_1 = self.create_pipeline()
        pipeline_1.fit(train_texts_1, train_labels_1)
        if self.debug:
            logging.info("Done in %0.2fs" % (time() - tick,))

        train_data_2 = Datasheet()
        for row in train_data_1.rows:
            if row[-1] != '0':
                train_data_2.append(row)
        train_texts_2 = zip(train_data_2.columns[0], train_data_2.columns[1])
        train_labels_2 = train_data_2.columns[-1]
        pipeline_2 = self.create_pipeline()
        pipeline_2.fit(train_texts_2, train_labels_2)
        return pipeline_1, pipeline_2
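
A minimal usage sketch (not part of the original example): how the two returned pipelines might be applied at prediction time. It assumes create_pipeline() builds a scikit-learn-style estimator whose predict() accepts the same (headline, article) pairs used for fitting; StanceClassifier and the sample text are hypothetical names.

# Hypothetical usage of the two-stage model returned by train() above.
clf = StanceClassifier(debug=False)       # assumed host class; not shown above
pipeline_1, pipeline_2 = clf.train("train.csv")

sample = [("Some headline", "Some article body")]
related = pipeline_1.predict(sample)[0]   # stage 1: related (1) vs. unrelated (0)
if related:
    print(pipeline_2.predict(sample)[0])  # stage 2: fine-grained label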
Example #2
def scrape_news_text(news_url):

    global counter

    news_html = requests.get(news_url).content

    #    print(news_html)
    # Convert the HTML to a BeautifulSoup object.
    news_soup = BeautifulSoup(news_html, 'lxml')
    # soup.find("div", {"id": "articlebody"})
    #    paragraphs = [par.text for par in news_soup.find_all('p')]
    #    news_text = '\n'.join(paragraphs)

    #    print(news_soup.find("div", {"id": "articleText"}))

    date_object = news_soup.find(itemprop="datePublished")
    news_object = news_soup.find("div", {"id": "articleText"})

    if date_object is None:
        return "  "

    if news_object is None:
        return "   "

    news_date = date_object.get_text()
    news_text = news_object.text

    #    print(news_date)
    #    print(news_text)
    print(news_url)

    try:
        # We'll store tweets in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
        # In the first column, we'll store a unique id for each tweet.
        # We only want to add the latest tweets, i.e., those we haven't seen yet.
        # With an index on the first column we can quickly check if an id already exists.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd("nasdaq2.csv"))
    except:
        table = Datasheet()

    news_sentiment = sentiment(news_text)

    print(news_sentiment)

    table.append([counter, news_date, news_url, news_sentiment])

    table.save(pd("nasdaq2.csv"))

    counter += 1

    return news_text
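
A short sketch for reading the accumulated CSV back with the same pattern.db API the scraper uses; the column order follows the append() call above.

from pattern.db import Datasheet, pd

table = Datasheet.load(pd("nasdaq2.csv"))
for row in table:
    news_date, news_url, news_sentiment = row[1], row[2], row[3]
    print("%s  %s  %s" % (news_date, news_url, news_sentiment))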
Example #3
def main():
    table = Datasheet()

    url = URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html")
    connection = url.open()
    doc = Document(connection.read())
    items = doc.by_class('ulamm')[1:]
    row = []
    for ul in items:
        li = ul.by_tag('li')
        kind = plaintext(ul.previous.content)
        for el in li:
            if el is not None:
                save = "%s, %s \n" % (plaintext(el.content).replace('\n', ','), kind)
                row.append(save)
    table.append(row)

    table.save("files/h_torino.txt")
Example #4
def main():
    table = Datasheet()

    for cap in CAPS:
        url = URL("http://www.comuni-italiani.it/001/272/farmacie/cap%s.html" % cap)
        connection = url.open()
        doc = Document(connection.read())
        items = doc.by_tag("table")
        row = []
        for j, td in enumerate(items[5].by_tag('td')):
            strcap = "%s, Telefono:" % cap
            save = "%s" % plaintext(td.content).replace('\n', ',', 3).replace("Telefono:", strcap).replace(";", "").replace("Partita Iva", ",Partita Iva") + "\n"
            if save is not None:
                row.append(save)
        table.append(row)
        print "%s ----------------------------------------------------------------------------" % str(j)

    table.save("files/farmacie_torino.txt")
Example #5
    def dumpdata(self):
        ''' Utility method to dump data in a csv file for later upload to the
        final database. Final database fields is found below.
        ---------------------------------------------------------------------
            1.) text = models.CharField(max_length=200)
            2.) owner = models.CharField(max_length=20)
            3.) label = models.CharField(max_length=20)
            4.) usage = models.CharField(max_length=20)
            5.) disease_type = models.CharField(max_length=20, null=True)
            6.) urlentity = models.CharField(max_length=20)
            7.) hashtagentity = models.CharField(max_length=20)
            8.) tweet_time = models.DateTimeField(db_index=True, default=datetime.now)
            9.) location= models.ForeignKey(Location, null=True, blank=True)
            10.) location_string = models.CharField(max_length=20, null=True)
            11.) from_lang = models.CharField(max_length=20)
            12.) lat
            13.) lng
            14.) country
        ''' 
        try:
            # We extract information from the database and store it in a CSV.
            data_dump = Datasheet.load("corpora/twitter/datadump2.csv")
            index = dict.fromkeys(data_dump[0], True)

        except:
            data_dump = Datasheet()
            index = {}
        
        for tweet in epi.models.Tweet.objects.all(): 
            id = str(hash(tweet.owner + tweet.text))   
            
            if len(data_dump) == 0 or id not in index:
                data_dump.append([id, tweet.text, tweet.owner, tweet.label, \
                tweet.usage, '', tweet.urlentity, tweet.tweet_time,\
                '', tweet.location, ''])
                index[id] = True
                
            data_dump.save("corpora/twitter/datadump2.csv")
Example #6
    def classify(self, document):
        ''' This method is used to classify new documents. Uses the saved model.
        '''
        
        #Loading csv predictions and corpora documents.
        try: 
            nb_predictions = Datasheet.load("predictions/NB/patterns_nb.csv")
            nb_corpus = Datasheet.load("corpora/NB/nb.csv")

            index_pred = dict.fromkeys(nb_predictions.columns[0], True)
            index_corp = dict.fromkeys(nb_corpus.columns[0], True)
        except:
            nb_predictions = Datasheet()
            nb_corpus = Datasheet()
            index_pred = {}
            index_corp = {}

        #Load model from file system
        classifier = Classifier.load('models/nb_model.ept')
        label = classifier.classify(Document(document))
        probability = classifier.classify(Document(document), discrete=False)[label]

        id = str(hash(label + document))

        if ("positive" in label):
            if len(nb_predictions) == 0 or id not in index_pred:
                nb_predictions.append([id, label, document, probability])
                index_pred[id] = True
                
        if len(nb_corpus) == 0 or id not in index_corp:
            nb_corpus.append([id, label, document, probability])
            index_corp[id] = True

        nb_predictions.save("predictions/NB/patterns_nb.csv")
        nb_corpus.save("corpora/NB/nb.csv")

        return label
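
For context, a sketch of how a model such as 'models/nb_model.ept' could be produced with pattern.vector in the first place. The (id, label, text, probability) column layout follows the corpus written above; the actual training script is not part of this example.

from pattern.db import Datasheet
from pattern.vector import Document, NB

nb = NB()
for row in Datasheet.load("corpora/NB/nb.csv"):
    label, text = row[1], row[2]
    nb.train(Document(text, type=label))
nb.save("models/nb_model.ept")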
Example #7
    def classify(self, document):
        ''' This method is used to classify new documents. Uses the saved model.
        '''
        
        #Loading csv predictions and corpora documents.
        try: 
            svm_predictions = Datasheet.load("predictions/svm.csv")
            svm_corpus = Datasheet.load("corpora/svm/svm.csv")

            index_pred = dict.fromkeys(svm_predictions.columns[0], True)
            index_corp = dict.fromkeys(svm_corpus.columns[0], True)
        except:
            svm_predictions = Datasheet()
            svm_corpus = Datasheet()
            index_pred = {}
            index_corp = {}

        #Load model from file system
        classifier = Classifier.load('models/svm_model2.ept')
        label = classifier.classify(Document(document))

        id = str(hash(label + document))

        if ("positive" in label):
            if len(svm_predictions) == 0 or id not in index_pred:
                svm_predictions.append([id, label, document])
                index_pred[id] = True
                
        if len(svm_corpus) == 0 or id not in index_corp:
            svm_corpus.append([id, label, document])
            index_corp[id] = True

        svm_predictions.save("predictions/svm.csv")
        svm_corpus.save("corpora/svm/svm.csv")

        return label
Example #8
    # results = engine.search("#SingleLifeIn3Words", start=prev, count=100, cached=False, date='2016-02-14')
    results = engine.search("#Valentines OR #ValentinesDay", start=prev, count=100, cached=False, date='2016-02-15', since='2016-02-13')
    for tweet in results:
        print
        # print str(tweet.text)
        print tweet.author
        print tweet.date
        print hashtags(tweet.text) # Keywords in tweets start with a "#".
        print
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            # remove new lines
            tweet.text = tweet.text.replace("\n", "")
            # tweet.raw = unicode(tweet.raw).encode('utf8').replace("\n", "")
            tweet.raw = json.dumps(tweet.raw, separators=(',', ': ')).replace("\n", "")
            table.append([tweet.id, tweet.text, tweet.date, tweet.language, tweet.shares, tweet.geo, tweet.geo_lat, tweet.geo_long, tweet.user_id, tweet.location, tweet.statuses_count, tweet.followers_count, tweet.friends_count, tweet.raw])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...

# pprint(table, truncate=100)
Example #9
        print
        # print str(tweet.text)
        print tweet.author
        print tweet.date
        print hashtags(tweet.text)  # Keywords in tweets start with a "#".
        print
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            # remove new lines
            tweet.text = tweet.text.replace("\n", "")
            # tweet.raw = unicode(tweet.raw).encode('utf8').replace("\n", "")
            tweet.raw = json.dumps(tweet.raw,
                                   separators=(',', ': ')).replace("\n", "")
            table.append([
                tweet.id, tweet.text, tweet.date, tweet.language, tweet.shares,
                tweet.geo, tweet.geo_lat, tweet.geo_long, tweet.user_id,
                tweet.location, tweet.statuses_count, tweet.followers_count,
                tweet.friends_count, tweet.raw
            ])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
Example #10
    def getTweetSecureLoad(self, topic):
        # This example retrieves tweets containing given keywords from Twitter.

        self.search_topic = topic
        print 'CLASS (Twitter_PatternPKG) - Twitter Secure Initial Load - Topic: ' + self.search_topic
        self.search_topic = topic + ' film'
        try: 
            # We'll store tweets in a Datasheet.
            # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
            # In the first column, we'll store a unique id for each tweet.
            # We only want to add the latest tweets, i.e., those we haven't seen yet.
            # With an index on the first column we can quickly check if an id already exists.
            # The pd() function returns the parent directory of this script + any given path.

            table = Datasheet.load(pd(self.FILE_STORAGE))
            # index = set(table.columns[0])
            index = set(table.columns[4])   # on the text
            
        except:
            table = Datasheet()
            index = set()

        engine = Twitter(language="en")

        # With Twitter.search(cached=False), a "live" request is sent to Twitter:
        # we get the most recent results instead of those in the local cache.
        # Keeping a local cache can also be useful (e.g., while testing)
        # because a query is instant when it is executed the second time.
        prev = None

        #searchThisSubjects = search_topic

        # put headers
        table.append(["tweet_id", "tweet_date", "InputSubject", "Tweet_text"])

        #for oneSubject in searchThisSubjects:
        oneSubject = self.search_topic
        # oneSubject

        tweet_list_Json = []  # list of JSons
        tweet_list = []
        try:
            for i in range(1):
                for tweet in engine.search(oneSubject, start=prev, count=8, cached=False):
                    if 'http' in tweet.text:
                        posi = tweet.text.index('http')
                        tweet.text = tweet.text[0:posi-1]
                                
                    # Only add the tweet to the table if it doesn't already exist.
                    if len(table) == 0 or tweet.text not in index :
                        table.append([tweet.id, tweet.date, oneSubject, tweet.text])
                        index.add(tweet.text)
                        
                        tweet_list.append([tweet.id, tweet.date, oneSubject, tweet.text])
                        #tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject, tweet.text)
                        #tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text)
                        tweet.text = filter(lambda x: x in string.printable, tweet.text) # remove weird stuff
                        tweet.text = tweet.text.replace('"', '') # remove weird stuff
                        tweet.text = tweet.text.replace('\n', '') # remove weird stuff
                        tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text) # remove the artificial ' film' suffix
                        
                        tweet_list_Json.append(tweetJson)
                        #print tweetJson  
                        
                        # BUILD A JSON
                        #http://stackoverflow.com/questions/14547916/how-can-i-loop-over-entries-in-json
                        #BUILD A LIST OF DICTIONARIES                    
                        #http://stackoverflow.com/questions/2733813/iterating-through-a-json-object
                        
                        
                    # Continue mining older tweets in next iteration.
                    prev = tweet.text
    
        except Exception:
            print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! - ([twitter_patternPkg_connector] getTweetSecureLoad)'
            print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!!   (film: ' + oneSubject +')' 
            pass
        
        # Create a .csv in pattern/examples/01-web/
        # table.save(pd("OD_CK1_Source4_Tweeter_InitialLoad.csv"))
        print "CLASS (Twitter_PatternPKG) - Total Secure Twitter Load: " +  str(len(table)) + '\n'
        #print json.dumps(tweet_list)
        
        # return tweet_list
        return tweet_list_Json
Example #11
    def getTweetSecureLoad(self, topic):
        # This example retrieves tweets containing given keywords from Twitter.

        self.search_topic = topic
        self.search_topic = topic + ' film'
        '''
        print 'CLASS (Twitter_PatternPKG) - Twitter Secure Initial Load - Topic: ' + self.search_topic
        try: 
            # We'll store tweets in a Datasheet.
            # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
            # In the first column, we'll store a unique id for each tweet.
            # We only want to add the latest tweets, i.e., those we haven't seen yet.
            # With an index on the first column we can quickly check if an id already exists.
            # The pd() function returns the parent directory of this script + any given path.

            table = Datasheet.load(pd(self.FILE_STORAGE))
            # index = set(table.columns[0])
            index = set(table.columns[4])   # on the text
            
        except:
            table = Datasheet()
            index = set()
        '''

        table = Datasheet()
        index = set()

        engine = Twitter(language="en")

        # With Twitter.search(cached=False), a "live" request is sent to Twitter:
        # we get the most recent results instead of those in the local cache.
        # Keeping a local cache can also be useful (e.g., while testing)
        # because a query is instant when it is executed the second time.
        prev = None

        #searchThisSubjects = search_topic

        # put headers
        table.append(["tweet_id", "tweet_date", "InputSubject", "Tweet_text"])

        #for oneSubject in searchThisSubjects:
        oneSubject = self.search_topic
        # oneSubject

        tweet_list_Json = []  # list of JSons
        tweet_list = []
        try:
            for i in range(1):
                for tweet in engine.search(oneSubject,
                                           start=prev,
                                           count=8,
                                           cached=False):
                    if 'http' in tweet.text:
                        posi = tweet.text.index('http')
                        tweet.text = tweet.text[0:posi - 1]

                    # Only add the tweet to the table if it doesn't already exist.
                    if len(table) == 0 or tweet.text not in index:
                        table.append(
                            [tweet.id, tweet.date, oneSubject, tweet.text])
                        index.add(tweet.text)

                        tweet_list.append(
                            [tweet.id, tweet.date, oneSubject, tweet.text])
                        #tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject, tweet.text)
                        #tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text)
                        tweet.text = filter(lambda x: x in string.printable,
                                            tweet.text)  # remove weird stuff
                        tweet.text = tweet.text.replace(
                            '"', '')  # remove weird stuff
                        tweet.text = tweet.text.replace(
                            '\n', '')  # remove weird stuff
                        tweetJson = self.formatData2Json(
                            tweet.id, tweet.date,
                            oneSubject.replace(' film', ''),
                            tweet.text)  # remove the artificial ' film' suffix

                        tweet_list_Json.append(tweetJson)
                        #print tweetJson

                        # BUILD A JSON
                        #http://stackoverflow.com/questions/14547916/how-can-i-loop-over-entries-in-json
                        #BUILD A LIST OF DICTIONARIES
                        #http://stackoverflow.com/questions/2733813/iterating-through-a-json-object

                    # Continue mining older tweets in next iteration.
                    prev = tweet.text

        except Exception:
            ''' 
            print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! - ([twitter_patternPkg_connector] getTweetSecureLoad)'
            '''
            print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!!   (film: ' + oneSubject + ')'
            pass

        # Create a .csv in pattern/examples/01-web/
        # table.save(pd("OD_CK1_Source4_Tweeter_InitialLoad.csv"))
        print "CLASS (Twitter_PatternPKG) - Total Secure Twitter Load: " + str(
            len(table)) + '\n'
        #print json.dumps(tweet_list)

        # return tweet_list
        return tweet_list_Json
Example #12
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("eulogy", start=prev, count=25, cached=False):
        print("")
        print(tweet.text)
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print("")
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("eulogy_july_21.csv"))

print("Total results: %s" % len(table))
print("")

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...

pprint(table, truncate=100)
Example #13
    table = Datasheet.load(pd("tweets.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

prev = '1071765537749917696'

counter = 0

while counter < 1000:

    counter += 1
    time.sleep(60)
    for tweet in engine.search("#Apple", start=prev, count=10, cached=False):
        print(tweet.id)
        #        print(tweet.text)
        #        print(tweet.date)
        tweet_sentiment = sentiment(tweet.text)
        print(tweet_sentiment)

        if len(table) == 0 or tweet.id not in index:

            table.append([tweet.id, tweet.date, tweet.text, tweet_sentiment])
            index.add(tweet.id)

        prev = tweet.id

table.save(pd("tweets2.csv"))
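
A sketch for post-processing the saved tweets: reload the CSV and re-score the stored text with sentiment(), which returns a (polarity, subjectivity) pair. The column order follows the append() call above.

from pattern.db import Datasheet, pd
from pattern.en import sentiment

polarities = []
for row in Datasheet.load(pd("tweets2.csv")):
    polarity, subjectivity = sentiment(row[2])   # columns: id, date, text, sentiment
    polarities.append(polarity)

if polarities:
    print("mean polarity: %.3f" % (sum(polarities) / len(polarities)))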
Example #14
csv = Datasheet()

for word, pos in lexicon.items():

    if " " not in word:

        f = frequency.get(word, frequency.get(word.lower(), 0))

        # Map to Penn Treebank II tagset.

        penn = [PENN[tag] for tag in pos if tag in PENN]
        penn += [tag] if tag in ("SYM", ".", ",", ":", "\"", "(", ")", "#",
                                 "$") else []
        penn = ", ".join(penn)

        # Collect tagged words in the .csv file.

        csv.append((f, word, penn))

        # Collect special words for post-processing.

        for tag in SPECIAL:

            if tag in pos:
                special.add(word)

csv.columns[0].sort(reverse=True)
csv.save("it-lexicon.csv")

print special
Example #15
#print adjectives

# We now want to sort the dictionary by frequency.
# The items() method of a Python dictionary returns a list of (key, value)-tuples.
# In our case, (lemma, [frequency, [form1, form2, ...]]), for example:
# ("beau", [620.07, ["beau", "beaux", "belle", "belles"]])
# We'll make a new list with the frequency at the start of each tuple.
# We can then sort the list by frequency.
adjectives = adjectives.items()
adjectives = [(weight, lemma, forms) for lemma, (weight, forms) in adjectives]
adjectives = sorted(adjectives, reverse=True) # Highest-first.
#print adjectives

# We want to save our list of adjectives as a new corpus.
# Something more manageable than 24MB.
# I prefer a new .csv file with two fields: lemma, and forms (comma-separated).
# Adjectives higher up in the list are more frequent,
# we should deal with those first to get a good coverage.
corpus = Datasheet()
for frequency, lemma, forms in adjectives:
    field1 = lemma
    field2 = ",".join(forms) # Collapse list to comma-separated string.
    corpus.append( [field1, field2] )
corpus.save("adj-fr.csv")

# We end up with a 500KB list of words commonly used to express emotion or opinion,
# sorted by how often they occur in books,
# along with their inflected forms (gender/number, such as "belles").
# The top 10 most frequent adjectives are: 
# "tout", "petit", "grand", "seul", "autre", "meme", "bon", "premier", "beau", "jeune", ...
Example #16
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
        print()
        print(tweet.text.encode("utf-8"))
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print()
        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)
        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print("Total results:", len(table))
print()

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further
# analysis, ...
Example #17
        # 4) Remove HTML tags:
        try:
            s = plaintext(e)
            s = s.strip()
        except:
            continue

        #if not s:
        #    print r.url
        #    print
        #    continue

        # 5) Save to CSV:
        if r.url not in seen:
            seen.add(r.url)
            csv.append((
                name,
                r.title,
                s,
                label,
            ))
            print(name, r.title)

    csv.save(pd(PATH))

# To read the dataset:
#for name, label, article in Datasheet.load(PATH):
#print(name, label, article)
#datasheet = Datasheet.load(PATH)
#print(datasheet)
Example #18
__author__ = 'Nitin'
from collocations import get_knowledge_from_collocations
from associations import get_knowledge_from_associations
from common_sense import get_knowledge_from_pattern_common_sense
from wordnet_nyms import get_knowledge_from_wordnet
from pattern.db import Datasheet
import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger(__name__)

g = []
get_knowledge_from_collocations(g)
get_knowledge_from_associations(g)
get_knowledge_from_pattern_common_sense(g)
#get_knowledge_from_wordnet(g)
knowledge = [tuple([head.strip(), tail.strip(), relation]) for head, tail, relation in g if
             '.' in head and '.' in tail and '/' not in head and '/' not in tail and '(' not in head and '(' not in tail
             and ')' not in head and ')' not in tail and not head.startswith('.') and not tail.startswith('.')]

logging.info('Memorising...')
ds = Datasheet()
for speck in knowledge:
    ds.append(speck)

ds.save('knowledge.csv')
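
A sketch for loading the saved triples back into memory, grouped by relation; it assumes the (head, tail, relation) order used when appending above.

from collections import defaultdict
from pattern.db import Datasheet

by_relation = defaultdict(list)
for head, tail, relation in Datasheet.load("knowledge.csv"):
    by_relation[relation].append((head, tail))

for relation, pairs in by_relation.items():
    print("%s: %d pairs" % (relation, len(pairs)))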
Example #19
f.close()

bigram_table = Datasheet()
all_tokens   = []

for row in table: 
    tweet = str(row).lower()
    tokens = []
    words = word_tokenize(tweet)
    for i in range(0, len(words) - 1):
        if words[i] == '@':
            tokens.append(str('@' + words[i + 1]))
        if words[i] == '#':
            tokens.append(str('#' + words[i + 1]))
    new_bigrams = nltk.bigrams(tokens)
    for bigram in new_bigrams:
        bigram_table.append(bigram)
    all_tokens.extend(tokens)

token_freq = nltk.FreqDist(all_tokens)

str_today = 'tweet_graph_' + str(date.today().day) + '-' + str(date.today().month) + '-' + str(date.today().year) + '.csv'
bigram_table.save(str_today)

new_twitter_subjects = list(set(all_tokens))
another_table = Datasheet()

# save the original list of twitter users, we'll use this in cytoscape 
spamWriter = csv.writer(open('original_twitter.csv', 'wb'), delimiter=' ', quotechar='|')
for i in list(set(all_tokens)): 
    spamWriter.writerow([i, 1])
Example #20
            e("div[class='section-blog-right']")[0]._p.extract()
            e("div[class='blog-sidebar-links']")[0]._p.extract()
            e("div[role='complementary']")[0]._p.extract()
        except:
            pass

        # 4) Remove HTML tags:
        try:
            s = plaintext(e)
            s = s.strip()
        except:
            continue

        #if not s:
        #    print r.url
        #    print
        #    continue

        # 5) Save to CSV:
        if r.url not in seen:
            seen.add(r.url)
            csv.append(
                (name, label, bias, str(level), r.title, s, r.url, r.date))
            print name, r.title
            print

    csv.save(pd(PATH))

# To read the dataset:
# for name, label, bias, level, title, article, url, date in Datasheet.load(PATH):
#     level = int(level)
Example #21
    csv = Datasheet()
    seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print name, tweet.text
            print
            seen.add(id)
            for w in adjectives(tweet.text):
                if not w.startswith(("@", "~", "1", "2")): # filter out weirdness
                    csv.append([tweet.id, name, w])
    csv.save(PATH)

# ------------------------------------------------------------------------------------
# Dataset reader.

PATH = pd("properties.csv")

f = {} # {celebrity: {property: count}}
for id, name, p in Datasheet.load(PATH):
    if name not in f:
        f[name] = {}     # {"Justin Bieber": {}}
    if p not in f[name]:
        f[name][p] = 0   # {"Justin Bieber": {"gay": 0}}
    f[name][p] += 1      # {"Justin Bieber": {"gay": 1}}
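
# Sketch (not in the original): print each celebrity's most frequent properties,
# using plain Python over the f dictionary built above.
for name in f:
    top = sorted(f[name].items(), key=lambda kv: kv[1], reverse=True)[:5]
    print("%s: %s" % (name, ", ".join(p for p, n in top)))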
        
Example #22
    # In the first column, we'll store a unique ID for each tweet.
    # We only want to add the latest Facebook statuses, i.e., those we haven't previously encountered.
    # With an index on the first column we can quickly check if an ID already exists.
    # The index becomes important once more and more rows are added to the table (speed).
    table = Datasheet.load("travel.txt")
    index = dict.fromkeys(table.columns[0], True)
except:
    table = Datasheet()
    index = {}

engine = Facebook()

# With cached=False, a live request is sent to Facebook,
# so we get the latest results for the query instead of those in the local cache.
for status in engine.search("Travelling to", count=25, cached=False):
    print status.description
    print status.author
    print status.date
    print
    id = status.url
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, status.description])
        index[id] = True

table.save("travel.txt")

print "Total results:", len(table)
print

Example #23
engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description) # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = str(hash(tweet.author + tweet.description))
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.csv")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
# We can also open the table later on, in other scripts, for further analysis.
#pprint(table)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
Example #24
id = None
for i in range(3):
    # Look for tweets containing the search query.
    # We can get a maximum of 100 tweets per search.
    # Don't cache the results locally,
    # so we get the latest new tweets when the script runs.
    # Do this 3x.
    for tweet in twitter.search(q, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print tweet.text
            print
            seen.add(id)
            csv.append([
                tweet.id, q, tweet.author, tweet.text, tweet.retweets,
                tweet.date
            ])
    # Update the CSV.
    csv.save(PATH)

# Each time you run the script, new tweets will be appended to the CSV.
# For example, we have Twitter miners that automatically run 10x each day,
# and have been running for many days and weeks.

# We can then import the data in other scripts, e.g.:

#from pattern.db import Datasheet, pd
#csv = Datasheet.load(pd("tweets.csv"))
#for id, q, author, text, retweets, date in csv:
#    print text
Example #25
    seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print name, tweet.text
            print
            seen.add(id)
            for w in adjectives(tweet.text):
                if not w.startswith(
                    ("@", "~", "1", "2")):  # filter out weirdness
                    csv.append([tweet.id, name, w])
    csv.save(PATH)

# ------------------------------------------------------------------------------------
# Dataset reader.

PATH = pd("properties.csv")

f = {}  # {celebrity: {property: count}}
for id, name, p in Datasheet.load(PATH):
    if name not in f:
        f[name] = {}  # {"Justin Bieber": {}}
    if p not in f[name]:
        f[name][p] = 0  # {"Justin Bieber": {"gay": 0}}
    f[name][p] += 1  # {"Justin Bieber": {"gay": 1}}
Example #26
f.close()

bigram_table = Datasheet()
all_tokens = []

for row in table:
    tweet = str(row).lower()
    tokens = []
    words = word_tokenize(tweet)
    for i in range(0, len(words) - 1):
        if words[i] == '@':
            tokens.append(str('@' + words[i + 1]))
        if words[i] == '#':
            tokens.append(str('#' + words[i + 1]))
    new_bigrams = nltk.bigrams(tokens)
    for bigram in new_bigrams:
        bigram_table.append(bigram)
    all_tokens.extend(tokens)

token_freq = nltk.FreqDist(all_tokens)

str_today = 'tweet_graph_' + str(date.today().day) + '-' + str(
    date.today().month) + '-' + str(date.today().year) + '.csv'
bigram_table.save(str_today)

new_twitter_subjects = list(set(all_tokens))
another_table = Datasheet()

# save the original list of twitter users, we'll use this in cytoscape
spamWriter = csv.writer(open('original_twitter.csv', 'wb'),
                        delimiter=' ',
Example #27
import re

from pattern.web import Twitter,hashtags,author
from pattern.db  import Datasheet, pprint, pd
from pattern.web import Google, plaintext


table = Datasheet()
index = set()
texts = set()


table.append(['Tweets'])


'''
Cleaning the tweets extracted from a particular user timeline,
code is available in another file
'''


with open('extracted_tweets_translated.txt') as f:
    for tweet in f.readlines():
        clean_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",tweet).split())
        print

        print clean_text
        print author(tweet)
        print hashtags(tweet)

        print
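        # Sketch (not in the original): collect each cleaned tweet in the
        # Datasheet created above, then save it once the file has been read.
        # The output filename below is hypothetical.
        table.append([clean_text])

table.save('extracted_tweets_clean.csv')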
Example #28
# Data Science is a field in computer science that is dedicated to analyzing patterns in raw data using
# techniques like Artificial Intelligence (AI), Machine Learning (ML), mathematical functions, and
# statistical algorithms.
# Pattern is a web mining module for the Python programming language.
# It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, an HTML DOM parser), natural
# language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning
# (vector space model, clustering, SVM), network analysis and <canvas> visualization.
# Twitter opinion mining.

from pattern.web import Twitter, plaintext
from pattern.db import Datasheet
from pattern.nl import sentiment as sentiment_nl
from pattern.fr import sentiment as sentiment_fr

csv = Datasheet()

for politician, party in (("bart de wever", "NV-A"), ("elio di rupo", "PS")):

    for tweet in Twitter().search(politician):

        if tweet.language in ("nl", "fr"):
            s = plaintext(tweet.description)

            if tweet.language == "nl":
                w = sentiment_nl(s)

            if tweet.language == "fr":
                w = sentiment_fr(s)

            csv.append([politician, party, tweet.date, s, w])
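
# Sketch (not in the original): persist the collected rows for later analysis;
# the filename is hypothetical.
csv.save("politician_sentiment.csv")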
Example #29
    seen = set(csv.columns[0])
except:
    csv = Datasheet()
    seen = set()

for genre, url in feeds.items():
    for r in Newsfeed().search(url, cached=False):
        if r.url not in seen:
            print r.title
            print
            try:
                src = URL(r.url).download(cached=True)
                dom = DOM(src)
                txt = []

                # Daily Star has untidy HTML markup.
                # Collect the article <p> by <p>.
                for p in dom('.story-content p'):
                    if p.parent.tag == 'blockquote':
                        continue
                    s = plaintext(p)
                    s = s.strip()
                    # Exclude all-caps references ("GETTY", "YOUTUBE").
                    if s != s.upper():
                        txt.append(s)

                seen.add(r.url)
                csv.append((r.url, r.title, '\n\n'.join(txt), genre))
            except:
                pass
    csv.save(PATH)
Example #30
    index2 = dict.fromkeys(table.columns[1], True)
except:
    table = Datasheet()
    index = {}
    index2 = {}


engine = Twitter(language="en")
comparray = [" "]  # spam filter
# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for i in range(1, 10000):
    for tweet in Twitter().search('volcano OR Sicily AND etna +exclude:retweets', start=i, count=100):
        comparray.append(tweet.text[0:15])
        print tweet.text
        print tweet.author
        print tweet.date
        print hashtags(tweet.text) # Keywords in tweets start with a #.
        print
        # Create a unique ID based on the tweet content and author.
        id = str(hash(tweet.author + tweet.text))
        # Only add the tweet to the table if it doesn't already contain this ID.
        if len(table) == 0 or (id not in index and sentiment(tweet.text)[0] != 0 and comparray[-1] != comparray[-2]):
            table.append([id, tweet.author, tweet.date, tweet.text, sentiment(tweet.text)[0]])
            index[id] = True

table.save("tweets.csv")

print "Total results:", len(table)
print
Example #31
# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Bing and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'          # Bing search query
p = "NP (VP) more important than NP" # Search pattern.
p = Pattern.fromstring(p)
d = Datasheet()

engine = Bing(license=None)
for i in range(1): # max=10
    for result in engine.search(q, start=i+1, count=100, cached=True):
        s = result.description
        s = plaintext(s)
        s = Sentence(parse(s))
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1] # Left NP.
            b = m.constituents(constraint=5)[ 0] # Right NP.
            d.append((
                a.string.lower(), 
                b.string.lower()))

pprint(d)

print
print len(d), "results."
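
# Sketch (not in the original): save the mined pairs "as a text file for further
# processing later on", as the comment at the top suggests; the filename is hypothetical.
d.save("more_important_than.csv")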
Example #32
engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print
    # Create a unique ID based on the tweet content and author.
    id = hash(tweet.author + tweet.description)
    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.txt")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
# We can also open the table later on, in other scripts, for further analysis.
#pprint(table)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
Example #33
                    # The star rating is <span class="swSprite s_star_5_0 " title="5.0 etoiles sur 5">.
                    score = review.by_class("swSprite")[0]
                    score = score.attributes["title"]
                    score = score.split(" ")[0]
                    score = float(score)
                
                    # The review is contained as plain text in the <div>.
                    text = ""
                    for child in review.children:
                        if child.type == "text":
                            text += child.source + " "
                    text = text.strip()
                    text = plaintext(text) # Remove HTML entities, tags, etc.
                    
                    if text:
                        corpus.append((text, score))
                        print score
                        print text
                        print
                
                except Exception, e:
                    #print e
                    pass

        # Now and then, save the corpus of (review, score) items as a .csv file.
        corpus.save("books-fr.csv")
        
# Can you think of other test data to mine for?
# Can you see why it would be useful to have different test sets?
# - Instead of book reviews + star rating, how about tweets + #win or #fail hashtag?
# - How about hotel reviews + star rating from http://fr.hotels.com?
Example #34
# With Facebook.search(cached=False), a "live" request is sent to Facebook:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print("=" * 100)
    print(status.id)
    print(status.text.encode("utf-8"))
    print(status.author)  # Yields an (id, name)-tuple.
    print(status.date)
    print(status.likes)
    print(status.comments)
    print()
    # Only add the tweet to the table if it doesn't already exist.
    if len(table) == 0 or status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

# Create a .csv in pattern/examples/01-web/
table.save(pd("opinions.csv"))

# 2) Status updates from specific profiles.
#    For this you need a personal license key:
#    http://www.clips.ua.ac.be/pattern-facebook

license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns a dictionary with author info.
    # By default, this is your own profile.
Example #35
                    # The star rating is <span class="swSprite s_star_5_0 " title="5.0 etoiles sur 5">.
                    score = review.by_class("swSprite")[0]
                    score = score.attributes["title"]
                    score = score.split(" ")[0]
                    score = float(score)

                    # The review is contained as plain text in the <div>.
                    text = ""
                    for child in review.children:
                        if child.type == "text":
                            text += child.source + " "
                    text = text.strip()
                    text = plaintext(text)  # Remove HTML entities, tags, etc.

                    if text:
                        corpus.append((text, score))
                        print score
                        print text
                        print

                except Exception, e:
                    #print e
                    pass

        # Now and then, save the corpus of (review, score) items as a .csv file.
        corpus.save("books-fr.csv")

# Can you think of other test data to mine for?
# Can you see why it would be useful to have different test sets?
# - Instead of book reviews + star rating, how about tweets + #win or #fail hashtag?
# - How about hotel reviews + star rating from http://fr.hotels.com?
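
As a follow-up to the questions above, a sketch of how the mined (review, score) corpus could feed a simple sentiment classifier with pattern.vector; the 4.0-star threshold and the test sentence are assumptions.

from pattern.db import Datasheet
from pattern.vector import Document, NB

nb = NB()
for text, score in Datasheet.load("books-fr.csv"):
    label = "positive" if float(score) >= 4.0 else "negative"
    nb.train(Document(text, type=label))

print(nb.classify(Document("Un roman magnifique, je le recommande !")))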