def train(self, train_path):
    """ Train classifier on features from headline and article text """
    if self.debug:
        tick = time()
        logging.info("Training new model with %s" % (train_path,))
        logging.info("Loading/shuffling training data...")

    train_data_1 = Datasheet.load(train_path)
    shuffle(train_data_1)
    train_texts_1 = zip(train_data_1.columns[0], train_data_1.columns[1])
    train_labels_1 = [0 if x == '0' else 1 for x in train_data_1.columns[-1]]

    if self.debug:
        logging.info('Fitting training data')
    pipeline_1 = self.create_pipeline()
    pipeline_1.fit(train_texts_1, train_labels_1)
    if self.debug:
        logging.info("Done in %0.2fs" % (time() - tick,))

    train_data_2 = Datasheet()
    for row in train_data_1.rows:
        if row[-1] != '0':
            train_data_2.append(row)
    train_texts_2 = zip(train_data_2.columns[0], train_data_2.columns[1])
    train_labels_2 = train_data_2.columns[-1]
    pipeline_2 = self.create_pipeline()
    pipeline_2.fit(train_texts_2, train_labels_2)

    return pipeline_1, pipeline_2
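# Editor's sketch (not from the original source): one way the two pipelines
# returned by train() could be chained at prediction time, assuming that
# pipeline_1 flags whether a (headline, article) pair is relevant at all
# (the 0-vs-1 labels above) and pipeline_2 assigns the fine-grained label it
# was fitted on. The helper name predict_two_stage and the .predict() calls
# are assumptions about the objects built by create_pipeline().
def predict_two_stage(pipeline_1, pipeline_2, headline, article):
    sample = [(headline, article)]          # same (headline, article) pairs used in fit()
    if pipeline_1.predict(sample)[0] == 0:  # stage 1: relevant or not
        return '0'
    return pipeline_2.predict(sample)[0]    # stage 2: fine-grained label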
def scrape_news_text(news_url):
    global counter
    news_html = requests.get(news_url).content
    # print(news_html)

    '''convert html to BeautifulSoup object'''
    news_soup = BeautifulSoup(news_html, 'lxml')

    # soup.find("div", {"id": "articlebody"})
    # paragraphs = [par.text for par in news_soup.find_all('p')]
    # news_text = '\n'.join(paragraphs)
    # print(news_soup.find("div", {"id": "articleText"}))

    date_object = news_soup.find(itemprop="datePublished")
    news_object = news_soup.find("div", {"id": "articleText"})
    if date_object is None:
        return " "
    if news_object is None:
        return " "

    news_date = date_object.get_text()  # find("div", {"id": "articleText"}).text
    news_text = news_object.text
    # print(news_date)
    # print(news_text)
    print(news_url)

    try:
        # We'll store tweets in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
        # In the first column, we'll store a unique id for each tweet.
        # We only want to add the latest tweets, i.e., those we haven't seen yet.
        # With an index on the first column we can quickly check if an id already exists.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd("nasdaq2.csv"))
    except:
        table = Datasheet()

    news_sentiment = sentiment(news_text)
    print(news_sentiment)
    table.append([counter, news_date, news_url, news_sentiment])
    table.save(pd("nasdaq2.csv"))
    counter += 1
    return news_text
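# Editor's sketch (assumption, not part of the scraper above): read nasdaq2.csv
# back and average the polarity that scrape_news_text() stored. pattern's
# sentiment() returns a (polarity, subjectivity) tuple, which the Datasheet
# saves as text, so it is re-parsed here with ast.literal_eval.
from ast import literal_eval
from pattern.db import Datasheet, pd

rows = Datasheet.load(pd("nasdaq2.csv"))
polarities = [literal_eval(row[3])[0] for row in rows]  # column 3 holds the sentiment tuple
if polarities:
    print(sum(polarities) / len(polarities))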
def main():
    table = Datasheet()
    url = URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html")
    connection = url.open()
    doc = Document(connection.read())
    items = doc.by_class('ulamm')[1:]
    row = []
    for ul in items:
        li = ul.by_tag('li')
        kind = plaintext(ul.previous.content)
        for el in li:
            if el != None:
                save = "%s, %s \n" % (plaintext(el.content).replace('\n', ','), kind)
                row.append(save)
    table.append(row)
    table.save("files/h_torino.txt")
def main():
    table = Datasheet()
    for cap in CAPS:
        url = URL("http://www.comuni-italiani.it/001/272/farmacie/cap%s.html" % cap)
        connection = url.open()
        doc = Document(connection.read())
        items = doc.by_tag("table")
        row = []
        for j, td in enumerate(items[5].by_tag('td')):
            strcap = "%s, Telefono:" % cap
            save = "%s" % plaintext(td.content).replace('\n', ',', 3).replace("Telefono:", strcap).replace(";", "").replace("Partita Iva", ",Partita Iva") + "\n"
            if save != None:
                row.append(save)
        table.append(row)
        print "%s ----------------------------------------------------------------------------" % str(j)
    table.save("files/farmacie_torino.txt")
def dumpdata(self):
    '''
    Utility method to dump data in a csv file for later upload to the final database.
    The final database fields are listed below.
    ---------------------------------------------------------------------
    1.) text = models.CharField(max_length=200)
    2.) owner = models.CharField(max_length=20)
    3.) label = models.CharField(max_length=20)
    4.) usage = models.CharField(max_length=20)
    5.) disease_type = models.CharField(max_length=20, null=True)
    6.) urlentity = models.CharField(max_length=20)
    7.) hashtagentity = models.CharField(max_length=20)
    8.) tweet_time = models.DateTimeField(db_index=True, default=datetime.now)
    9.) location = models.ForeignKey(Location, null=True, blank=True)
    10.) location_string = models.CharField(max_length=20, null=True)
    11.) from_lang = models.CharField(max_length=20)
    12.) lat
    13.) lng
    14.) country
    '''
    try:
        # We extract information from the database and store it in a csv.
        data_dump = Datasheet.load("corpora/twitter/datadump2.csv")
        index = dict.fromkeys(data_dump[0], True)
    except:
        data_dump = Datasheet()
        index = {}
    for tweet in epi.models.Tweet.objects.all():
        id = str(hash(tweet.owner + tweet.text))
        if len(data_dump) == 0 or id not in index:
            data_dump.append([id, tweet.text, tweet.owner, tweet.label,
                              tweet.usage, '', tweet.urlentity, tweet.tweet_time,
                              '', tweet.location, ''])
            index[id] = True
    data_dump.save("corpora/twitter/datadump2.csv")
def classify(self, document):
    '''
    This method is used to classify new documents. Uses the saved model.
    '''
    # Loading csv predictions and corpora documents.
    try:
        nb_predictions = Datasheet.load("predictions/NB/patterns_nb.csv")
        nb_corpus = Datasheet.load("corpora/NB/nb.csv")
        index_pred = dict.fromkeys(nb_predictions.columns[0], True)
        index_corp = dict.fromkeys(nb_corpus.columns[0], True)
    except:
        nb_predictions = Datasheet()
        nb_corpus = Datasheet()
        index_pred = {}
        index_corp = {}
    # Load model from file system
    classifier = Classifier.load('models/nb_model.ept')
    label = classifier.classify(Document(document))
    probability = classifier.classify(Document(document), discrete=False)[label]
    id = str(hash(label + document))
    if ("positive" in label):
        if len(nb_predictions) == 0 or id not in index_pred:
            nb_predictions.append([id, label, document, probability])
            index_pred[id] = True
        if len(nb_corpus) == 0 or id not in index_corp:
            nb_corpus.append([id, label, document, probability])
            index_corp[id] = True
        nb_predictions.save("predictions/NB/patterns_nb.csv")
        nb_corpus.save("corpora/NB/nb.csv")
    return label
def classify(self, document):
    '''
    This method is used to classify new documents. Uses the saved model.
    '''
    # Loading csv predictions and corpora documents.
    try:
        svm_predictions = Datasheet.load("predictions/svm.csv")
        svm_corpus = Datasheet.load("corpora/svm/svm.csv")
        index_pred = dict.fromkeys(svm_predictions.columns[0], True)
        index_corp = dict.fromkeys(svm_corpus.columns[0], True)
    except:
        svm_predictions = Datasheet()
        svm_corpus = Datasheet()
        index_pred = {}
        index_corp = {}
    # Load model from file system
    classifier = Classifier.load('models/svm_model2.ept')
    label = classifier.classify(Document(document))
    id = str(hash(label + document))
    if ("positive" in label):
        if len(svm_predictions) == 0 or id not in index_pred:
            svm_predictions.append([id, label, document])
            index_pred[id] = True
        if len(svm_corpus) == 0 or id not in index_corp:
            svm_corpus.append([id, label, document])
            index_corp[id] = True
        svm_predictions.save("predictions/svm.csv")
        svm_corpus.save("corpora/svm/svm.csv")
    return label
# results = engine.search("#SingleLifeIn3Words", start=prev, count=100, cached=False, date='2016-02-14')
results = engine.search("#Valentines OR #ValentinesDay", start=prev, count=100,
                        cached=False, date='2016-02-15', since='2016-02-13')

for tweet in results:
    print
    # print str(tweet.text)
    print tweet.author
    print tweet.date
    print hashtags(tweet.text)  # Keywords in tweets start with a "#".
    print

    # Only add the tweet to the table if it doesn't already exist.
    if len(table) == 0 or tweet.id not in index:
        # remove new lines
        tweet.text = tweet.text.replace("\n", "")
        # tweet.raw = unicode(tweet.raw).encode('utf8').replace("\n", "")
        tweet.raw = json.dumps(tweet.raw, separators=(',', ': ')).replace("\n", "")
        table.append([tweet.id, tweet.text, tweet.date, tweet.language, tweet.shares,
                      tweet.geo, tweet.geo_lat, tweet.geo_long, tweet.user_id,
                      tweet.location, tweet.statuses_count, tweet.followers_count,
                      tweet.friends_count, tweet.raw])
        index.add(tweet.id)

    # Continue mining older tweets in next iteration.
    prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
# pprint(table, truncate=100)
    print
    # print str(tweet.text)
    print tweet.author
    print tweet.date
    print hashtags(tweet.text)  # Keywords in tweets start with a "#".
    print

    # Only add the tweet to the table if it doesn't already exist.
    if len(table) == 0 or tweet.id not in index:
        # remove new lines
        tweet.text = tweet.text.replace("\n", "")
        # tweet.raw = unicode(tweet.raw).encode('utf8').replace("\n", "")
        tweet.raw = json.dumps(tweet.raw, separators=(',', ': ')).replace("\n", "")
        table.append([
            tweet.id, tweet.text, tweet.date, tweet.language, tweet.shares,
            tweet.geo, tweet.geo_lat, tweet.geo_long, tweet.user_id,
            tweet.location, tweet.statuses_count, tweet.followers_count,
            tweet.friends_count, tweet.raw
        ])
        index.add(tweet.id)

    # Continue mining older tweets in next iteration.
    prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
def getTweetSecureLoad(self, topic):
    # This example retrieves tweets containing given keywords from Twitter.
    self.search_topic = topic
    print 'CLASS (Twitter_PatternPKG) - Twitter Secure Initial Load - Topic: ' + self.search_topic
    self.search_topic = topic + ' film'

    try:
        # We'll store tweets in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
        # In the first column, we'll store a unique id for each tweet.
        # We only want to add the latest tweets, i.e., those we haven't seen yet.
        # With an index on the first column we can quickly check if an id already exists.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd(self.FILE_STORAGE))
        # index = set(table.columns[0])
        index = set(table.columns[4])  # on the text
    except:
        table = Datasheet()
        index = set()

    engine = Twitter(language="en")

    # With Twitter.search(cached=False), a "live" request is sent to Twitter:
    # we get the most recent results instead of those in the local cache.
    # Keeping a local cache can also be useful (e.g., while testing)
    # because a query is instant when it is executed the second time.
    prev = None
    # searchThisSubjects = search_topic

    # put headers
    table.append(["tweet_id", "tweet_date", "InputSubject", "Tweet_text"])

    # for oneSubject in searchThisSubjects:
    oneSubject = self.search_topic  # oneSubject
    tweet_list_Json = []  # list of JSons
    tweet_list = []

    try:
        for i in range(1):
            for tweet in engine.search(oneSubject, start=prev, count=8, cached=False):
                if 'http' in tweet.text:
                    posi = tweet.text.index('http')
                    tweet.text = tweet.text[0:posi - 1]
                # Only add the tweet to the table if it doesn't already exist.
                if len(table) == 0 or tweet.text not in index:
                    table.append([tweet.id, tweet.date, oneSubject, tweet.text])
                    index.add(tweet.text)
                    tweet_list.append([tweet.id, tweet.date, oneSubject, tweet.text])
                    # tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject, tweet.text)
                    # tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text)
                    tweet.text = filter(lambda x: x in string.printable, tweet.text)  # remove weird stuff
                    tweet.text = tweet.text.replace('"', '')  # remove weird stuff
                    tweet.text = tweet.text.replace('\n', '')  # remove weird stuff
                    tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text)  # remove artificial ' film'
                    tweet_list_Json.append(tweetJson)
                    # print tweetJson
                    # BUILD A JSON
                    # http://stackoverflow.com/questions/14547916/how-can-i-loop-over-entries-in-json
                    # BUILD A LIST OF DICTIONARIES
                    # http://stackoverflow.com/questions/2733813/iterating-through-a-json-object
                # Continue mining older tweets in next iteration.
                prev = tweet.text
    except Exception:
        print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! - ([twitter_patternPkg_connector] getTweetSecureLoad)'
        print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! (film: ' + oneSubject + ')'
        pass

    # Create a .csv in pattern/examples/01-web/
    # table.save(pd("OD_CK1_Source4_Tweeter_InitialLoad.csv"))
    print "CLASS (Twitter_PatternPKG) - Total Secure Twitter Load: " + str(len(table)) + '\n'
    # print json.dumps(tweet_list)
    # return tweet_list
    return tweet_list_Json
def getTweetSecureLoad(self, topic):
    # This example retrieves tweets containing given keywords from Twitter.
    self.search_topic = topic
    self.search_topic = topic + ' film'
    '''
    print 'CLASS (Twitter_PatternPKG) - Twitter Secure Initial Load - Topic: ' + self.search_topic
    try:
        # We'll store tweets in a Datasheet.
        # A Datasheet is a table of rows and columns that can be exported as a CSV-file.
        # In the first column, we'll store a unique id for each tweet.
        # We only want to add the latest tweets, i.e., those we haven't seen yet.
        # With an index on the first column we can quickly check if an id already exists.
        # The pd() function returns the parent directory of this script + any given path.
        table = Datasheet.load(pd(self.FILE_STORAGE))
        # index = set(table.columns[0])
        index = set(table.columns[4])  # on the text
    except:
        table = Datasheet()
        index = set()
    '''
    table = Datasheet()
    index = set()

    engine = Twitter(language="en")

    # With Twitter.search(cached=False), a "live" request is sent to Twitter:
    # we get the most recent results instead of those in the local cache.
    # Keeping a local cache can also be useful (e.g., while testing)
    # because a query is instant when it is executed the second time.
    prev = None
    # searchThisSubjects = search_topic

    # put headers
    table.append(["tweet_id", "tweet_date", "InputSubject", "Tweet_text"])

    # for oneSubject in searchThisSubjects:
    oneSubject = self.search_topic  # oneSubject
    tweet_list_Json = []  # list of JSons
    tweet_list = []

    try:
        for i in range(1):
            for tweet in engine.search(oneSubject, start=prev, count=8, cached=False):
                if 'http' in tweet.text:
                    posi = tweet.text.index('http')
                    tweet.text = tweet.text[0:posi - 1]
                # Only add the tweet to the table if it doesn't already exist.
                if len(table) == 0 or tweet.text not in index:
                    table.append([tweet.id, tweet.date, oneSubject, tweet.text])
                    index.add(tweet.text)
                    tweet_list.append([tweet.id, tweet.date, oneSubject, tweet.text])
                    # tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject, tweet.text)
                    # tweetJson = self.formatData2Json(tweet.id, tweet.date, oneSubject.replace(' film', ''), tweet.text)
                    tweet.text = filter(lambda x: x in string.printable, tweet.text)  # remove weird stuff
                    tweet.text = tweet.text.replace('"', '')  # remove weird stuff
                    tweet.text = tweet.text.replace('\n', '')  # remove weird stuff
                    tweetJson = self.formatData2Json(
                        tweet.id, tweet.date, oneSubject.replace(' film', ''),
                        tweet.text)  # remove artificial ' film'
                    tweet_list_Json.append(tweetJson)
                    # print tweetJson
                    # BUILD A JSON
                    # http://stackoverflow.com/questions/14547916/how-can-i-loop-over-entries-in-json
                    # BUILD A LIST OF DICTIONARIES
                    # http://stackoverflow.com/questions/2733813/iterating-through-a-json-object
                # Continue mining older tweets in next iteration.
                prev = tweet.text
    except Exception:
        '''
        print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! - ([twitter_patternPkg_connector] getTweetSecureLoad)'
        '''
        print 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX - ERROR!! (film: ' + oneSubject + ')'
        pass

    # Create a .csv in pattern/examples/01-web/
    # table.save(pd("OD_CK1_Source4_Tweeter_InitialLoad.csv"))
    print "CLASS (Twitter_PatternPKG) - Total Secure Twitter Load: " + str(len(table)) + '\n'
    # print json.dumps(tweet_list)
    # return tweet_list
    return tweet_list_Json
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("eulogy", start=prev, count=25, cached=False):
        print("")
        print(tweet.text)
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print("")

        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)

        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("eulogy_july_21.csv"))

print("Total results: %s" % len(table))
print("")

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
pprint(table, truncate=100)
    table = Datasheet.load(pd("tweets.csv"))
    index = set(table.columns[0])
except:
    table = Datasheet()
    index = set()

engine = Twitter(language="en")

prev = '1071765537749917696'
counter = 0
while counter < 1000:
    counter += 1
    time.sleep(60)
    for tweet in engine.search("#Apple", start=prev, count=10, cached=False):
        print(tweet.id)
        # print(tweet.text)
        # print(tweet.date)
        tweet_sentiment = sentiment(tweet.text)
        print(tweet_sentiment)
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.date, tweet.text, tweet_sentiment])
            index.add(tweet.id)
        prev = tweet.id
    table.save(pd("tweets2.csv"))
csv = Datasheet()
for word, pos in lexicon.items():
    if " " not in word:
        f = frequency.get(word, frequency.get(word.lower(), 0))
        # Map to Penn Treebank II tagset.
        penn = [PENN[tag] for tag in pos if tag in PENN]
        penn += [tag] if tag in ("SYM", ".", ",", ":", "\"", "(", ")", "#", "$") else []
        penn = ", ".join(penn)
        # Collect tagged words in the .csv file.
        csv.append((f, word, penn))
        # Collect special words for post-processing.
        for tag in SPECIAL:
            if tag in pos:
                special.add(word)

csv.columns[0].sort(reverse=True)
csv.save("it-lexicon.csv")

print special
# print adjectives

# We now want to sort the dictionary by frequency.
# The items() method of a Python dictionary returns a list of (key, value)-tuples.
# In our case, (lemma, [frequency, [form1, form2, ...]]), for example:
# ("beau", [620.07, ["beau", "beaux", "belle", "belles"]])
# We'll make a new list with the frequency at the start of each tuple.
# We can then sort the list by frequency.
adjectives = adjectives.items()
adjectives = [(weight, lemma, forms) for lemma, (weight, forms) in adjectives]
adjectives = sorted(adjectives, reverse=True)  # Highest-first.
# print adjectives

# We want to save our list of adjectives as a new corpus.
# Something more manageable than 24MB.
# I prefer a new .csv file with two fields: lemma, and forms (comma-separated).
# Adjectives higher up in the list are more frequent,
# so we should deal with those first to get good coverage.
corpus = Datasheet()
for frequency, lemma, forms in adjectives:
    field1 = lemma
    field2 = ",".join(forms)  # Collapse list to comma-separated string.
    corpus.append([field1, field2])
corpus.save("adj-fr.csv")

# We end up with a 500KB list of words commonly used to express emotion or opinion,
# sorted by how often they occur in books,
# along with their inflected forms (gender/number, such as "belles").
# The top 10 most frequent adjectives are:
# "tout", "petit", "grand", "seul", "autre", "meme", "bon", "premier", "beau", "jeune", ...
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
prev = None
for i in range(2):
    print(i)
    for tweet in engine.search("is cooler than", start=prev, count=25, cached=False):
        print()
        print(tweet.text.encode("utf-8"))
        print(tweet.author)
        print(tweet.date)
        print(hashtags(tweet.text))  # Keywords in tweets start with a "#".
        print()

        # Only add the tweet to the table if it doesn't already exist.
        if len(table) == 0 or tweet.id not in index:
            table.append([tweet.id, tweet.text])
            index.add(tweet.id)

        # Continue mining older tweets in next iteration.
        prev = tweet.id

# Create a .csv in pattern/examples/01-web/
table.save(pd("cool.csv"))

print("Total results:", len(table))
print()

# Print all the rows in the table.
# Since it is stored as a CSV-file it grows comfortably each time the script runs.
# We can also open the table later on: in other scripts, for further analysis, ...
# 4) Remove HTML tags:
try:
    s = plaintext(e)
    s = s.strip()
except:
    continue

# if not s:
#     print r.url
#     print
#     continue

# 5) Save to CSV:
if r.url not in seen:
    seen.add(r.url)
    csv.append((
        name,
        r.title,
        s,
        label,
    ))
    print(name, r.title)

csv.save(pd(PATH))

# To read the dataset:
# for name, label, article in Datasheet.load(PATH):
#     print(name, label, article)
# datasheet = Datasheet.load(PATH)
# print(datasheet)
__author__ = 'Nitin'

from collocations import get_knowledge_from_collocations
from associations import get_knowledge_from_associations
from common_sense import get_knowledge_from_pattern_common_sense
from wordnet_nyms import get_knowledge_from_wordnet
from pattern.db import Datasheet

import logging

logging.basicConfig(level=logging.INFO)
logging.getLogger(__name__)

g = []
get_knowledge_from_collocations(g)
get_knowledge_from_associations(g)
get_knowledge_from_pattern_common_sense(g)
# get_knowledge_from_wordnet(g)

knowledge = [tuple([head.strip(), tail.strip(), relation])
             for head, tail, relation in g
             if '.' in head and '.' in tail
             and '/' not in head and '/' not in tail
             and '(' not in head and '(' not in tail
             and ')' not in head and ')' not in tail
             and not head.startswith('.') and not tail.startswith('.')]

logging.info('Memorising...')
ds = Datasheet()
for speck in knowledge:
    ds.append(speck)
ds.save('knowledge.csv')
f.close()

bigram_table = Datasheet()
all_tokens = []
for row in table:
    tweet = str(row).lower()
    tokens = []
    for i in range(0, len(word_tokenize(tweet))):
        if word_tokenize(tweet)[i] == '@':
            tokens.append(str('@' + word_tokenize(tweet)[i + 1]))
        if word_tokenize(tweet)[i] == '#':
            tokens.append(str('#' + word_tokenize(tweet)[i + 1]))
    new_bigrams = nltk.bigrams(tokens)
    for bigram in new_bigrams:
        bigram_table.append(bigram)
    for token in tokens:
        all_tokens.extend(tokens)

token_freq = nltk.FreqDist(all_tokens)

str_today = 'tweet_graph_' + str(date.today().day) + '-' + str(date.today().month) + '-' + str(date.today().year) + '.csv'
bigram_table.save(str_today)

new_twitter_subjects = list(set(all_tokens))

another_table = Datasheet()

# save the original list of twitter users, we'll use this in cytoscape
spamWriter = csv.writer(open('original_twitter.csv', 'wb'), delimiter=' ', quotechar='|')
for i in list(set(all_tokens)):
    spamWriter.writerow([i, 1])
    e("div[class='section-blog-right']")[0]._p.extract()
    e("div[class='blog-sidebar-links']")[0]._p.extract()
    e("div[role='complementary']")[0]._p.extract()
except:
    pass

# 4) Remove HTML tags:
try:
    s = plaintext(e)
    s = s.strip()
except:
    continue

# if not s:
#     print r.url
#     print
#     continue

# 5) Save to CSV:
if r.url not in seen:
    seen.add(r.url)
    csv.append((name, label, bias, str(level), r.title, s, r.url, r.date))
    print name, r.title
    print

csv.save(pd(PATH))

# To read the dataset:
# for name, label, bias, level, title, article, url, date in Datasheet.load(PATH):
#     level = int(level)
csv = Datasheet()
seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print name, tweet.text
            print
            seen.add(id)
            for w in adjectives(tweet.text):
                if not w.startswith(("@", "~", "1", "2")):  # filter out weirdness
                    csv.append([tweet.id, name, w])
    csv.save(PATH)

# ------------------------------------------------------------------------------------
# Dataset reader.

PATH = pd("properties.csv")

f = {}  # {celebrity: {property: count}}

for id, name, p in Datasheet.load(PATH):
    if name not in f:
        f[name] = {}  # {"Justin Bieber": {}}
    if p not in f[name]:
        f[name][p] = 0  # {"Justin Bieber": {"gay": 0}}
    f[name][p] += 1  # {"Justin Bieber": {"gay": 1}}
    # In the first column, we'll store a unique ID for each tweet.
    # We only want to add the latest facebook status, i.e., those we haven't previously encountered.
    # With an index on the first column we can quickly check if an ID already exists.
    # The index becomes important once more and more rows are added to the table (speed).
    table = Datasheet.load("travel.txt")
    index = dict.fromkeys(table.columns[0], True)
except:
    table = Datasheet()
    index = {}

engine = Facebook()

# With cached=False, a live request is sent to Facebook,
# so we get the latest results for the query instead of those in the local cache.
for status in engine.search("Travelling to", count=25, cached=False):
    print status.description
    print status.author
    print status.date
    print

    id = status.url

    # Only add the status to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, status.description])
        index[id] = True

table.save("travel.txt")

print "Total results:", len(table)
print
engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print

    # Create a unique ID based on the tweet content and author.
    id = str(hash(tweet.author + tweet.description))

    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.csv")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
# We can also open the table later on, in other scripts, for further analysis.
# pprint(table)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
id = None

for i in range(3):
    # Look for tweets containing the search query.
    # We can get a maximum of 100 tweets per search.
    # Don't cache the results locally,
    # so we get the latest new tweets when the script runs.
    # Do this 3x.
    for tweet in twitter.search(q, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print tweet.text
            print
            seen.add(id)
            csv.append([tweet.id, q, tweet.author, tweet.text, tweet.retweets, tweet.date])

# Update the CSV.
csv.save(PATH)

# Each time you run the script, new tweets will be appended to the CSV.
# For example, we have Twitter miners that automatically run 10x each day,
# and have been running for many days and weeks.

# We can then import the data in other scripts, e.g.:

# from pattern.db import Datasheet, pd
# csv = Datasheet.load(pd("tweets.csv"))
# for id, q, author, text, retweets, date in csv:
#     print text
seen = set()

twitter = Twitter(language="en", license=None)

for name in celebrities():
    id = None
    for tweet in twitter.search(name, start=id, count=100, cached=False):
        id = tweet.id
        if id not in seen:
            print name, tweet.text
            print
            seen.add(id)
            for w in adjectives(tweet.text):
                if not w.startswith(("@", "~", "1", "2")):  # filter out weirdness
                    csv.append([tweet.id, name, w])
    csv.save(PATH)

# ------------------------------------------------------------------------------------
# Dataset reader.

PATH = pd("properties.csv")

f = {}  # {celebrity: {property: count}}

for id, name, p in Datasheet.load(PATH):
    if name not in f:
        f[name] = {}  # {"Justin Bieber": {}}
    if p not in f[name]:
        f[name][p] = 0  # {"Justin Bieber": {"gay": 0}}
    f[name][p] += 1  # {"Justin Bieber": {"gay": 1}}
f.close()

bigram_table = Datasheet()
all_tokens = []
for row in table:
    tweet = str(row).lower()
    tokens = []
    for i in range(0, len(word_tokenize(tweet))):
        if word_tokenize(tweet)[i] == '@':
            tokens.append(str('@' + word_tokenize(tweet)[i + 1]))
        if word_tokenize(tweet)[i] == '#':
            tokens.append(str('#' + word_tokenize(tweet)[i + 1]))
    new_bigrams = nltk.bigrams(tokens)
    for bigram in new_bigrams:
        bigram_table.append(bigram)
    for token in tokens:
        all_tokens.extend(tokens)

token_freq = nltk.FreqDist(all_tokens)

str_today = 'tweet_graph_' + str(date.today().day) + '-' + str(
    date.today().month) + '-' + str(date.today().year) + '.csv'
bigram_table.save(str_today)

new_twitter_subjects = list(set(all_tokens))

another_table = Datasheet()

# save the original list of twitter users, we'll use this in cytoscape
spamWriter = csv.writer(open('original_twitter.csv', 'wb'), delimiter=' ',
                        quotechar='|')
import re

from pattern.web import Twitter, hashtags, author
from pattern.db import Datasheet, pprint, pd
from pattern.web import Google, plaintext

table = Datasheet()
index = set()
texts = set()
table.append(['Tweets'])

'''
Cleaning the tweets extracted from a particular user timeline,
code is available in another file
'''
with open('extracted_tweets_translated.txt') as f:
    for tweet in f.readlines():
        clean_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())
        print
        print clean_text
        print author(tweet)
        print hashtags(tweet)
        print
# Data Science is a field in computer science that is dedicated to analyzing patterns in raw data using
# techniques like Artificial Intelligence (AI), Machine Learning (ML), mathematical functions, and
# statistical algorithms.

# Pattern is a web mining module for the Python programming language.
# It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser), natural
# language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning
# (vector space model, clustering, SVM), network analysis and <canvas> visualization.

# Twitter opinion mining.

from pattern.web import Twitter, plaintext
from pattern.db import Datasheet
from pattern.nl import sentiment as sentiment_nl
from pattern.fr import sentiment as sentiment_fr

csv = Datasheet()

for politician, party in (("bart de wever", "NV-A"), ("elio di rupo", "PS")):
    for tweet in Twitter().search(politician):
        if tweet.language in ("nl", "fr"):
            s = plaintext(tweet.description)
            if tweet.language == "nl":
                w = sentiment_nl(s)
            if tweet.language == "fr":
                w = sentiment_fr(s)
            csv.append([politician, party, tweet.date, s, w])
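# Editor's sketch (assumption, not in the original example): average the
# polarity per politician from the rows collected above. w is the
# (polarity, subjectivity) tuple returned by sentiment_nl() / sentiment_fr().
from collections import defaultdict

scores = defaultdict(list)
for politician, party, date, text, w in csv:
    scores[politician].append(w[0])
for politician, values in scores.items():
    print("%s %.3f" % (politician, sum(values) / len(values)))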
    seen = set(csv.columns[0])
except:
    csv = Datasheet()
    seen = set()

for genre, url in feeds.items():
    for r in Newsfeed().search(url, cached=False):
        if r.url not in seen:
            print r.title
            print
            try:
                src = URL(r.url).download(cached=True)
                dom = DOM(src)
                txt = []
                # Daily Star has untidy HTML markup.
                # Collect the article <p> by <p>.
                for p in dom('.story-content p'):
                    if p.parent.tag == 'blockquote':
                        continue
                    s = plaintext(p)
                    s = s.strip()
                    if s != s.upper():  # Exclude references ("GETTY", "YOUTUBE").
                        txt.append(s)
                seen.add(r.url)
                csv.append((r.url, r.title, '\n\n'.join(txt), genre))
            except:
                pass

csv.save(PATH)
    index2 = dict.fromkeys(table.columns[1], True)
except:
    table = Datasheet()
    index = {}
    index2 = {}

engine = Twitter(language="en")

comparray = [" "]  # spam filter

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for i in range(1, 10000):
    for tweet in Twitter().search('volcano OR Sicily AND etna +exclude:retweets', start=i, count=100):
        comparray.append(tweet.text[0:15])
        print tweet.text
        print tweet.author
        print tweet.date
        print hashtags(tweet.text)  # Keywords in tweets start with a #.
        print

        # Create a unique ID based on the tweet content and author.
        id = str(hash(tweet.author + tweet.text))

        # Only add the tweet to the table if it doesn't already contain this ID,
        # has non-zero sentiment, and is not a repeat of the previous tweet.
        if (len(table) == 0 or id not in index) and sentiment(tweet.text)[0] != 0 and comparray[-1] != comparray[-2]:
            table.append([id, tweet.author, tweet.date, tweet.text, sentiment(tweet.text)[0]])
            index[id] = True

    table.save("tweets.csv")

print "Total results:", len(table)
print
# "X IS MORE IMPORTANT THAN Y"

# Here is a rough example of how to build a web miner.
# It mines comparative statements from Bing and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
# from MBSP import Sentence, parse

q = '"more important than"'           # Bing search query
p = "NP (VP) more important than NP"  # Search pattern.
p = Pattern.fromstring(p)
d = Datasheet()

engine = Bing(license=None)

for i in range(1):  # max=10
    for result in engine.search(q, start=i + 1, count=100, cached=True):
        s = result.description
        s = plaintext(s)
        s = Sentence(parse(s))
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1]  # Left NP.
            b = m.constituents(constraint=5)[0]   # Right NP.
            d.append((a.string.lower(), b.string.lower()))

pprint(d)

print
print len(d), "results."
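# Editor's note: the comment above says the results can be saved as a text file
# for later processing, but the snippet never does so. A minimal, assumed
# follow-up (the file name is a placeholder):
d.save("important.csv")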
engine = Twitter(language="en")

# With cached=False, a live request is sent to Twitter,
# so we get the latest results for the query instead of those in the local cache.
for tweet in engine.search("is cooler than", count=25, cached=False):
    print tweet.description
    print tweet.author
    print tweet.date
    print hashtags(tweet.description)  # Keywords in tweets start with a #.
    print

    # Create a unique ID based on the tweet content and author.
    id = hash(tweet.author + tweet.description)

    # Only add the tweet to the table if it doesn't already contain this ID.
    if len(table) == 0 or id not in index:
        table.append([id, tweet.description])
        index[id] = True

table.save("cool.txt")

print "Total results:", len(table)
print

# Print all the rows in the table.
# Since it is stored as a file it can grow comfortably each time the script runs.
# We can also open the table later on, in other scripts, for further analysis.
# pprint(table)

# Note: you can also search tweets by author:
# Twitter().search("from:tom_de_smedt")
        # The star rating is <span class="swSprite s_star_5_0 " title="5.0 etoiles sur 5">.
        score = review.by_class("swSprite")[0]
        score = score.attributes["title"]
        score = score.split(" ")[0]
        score = float(score)
        # The review is contained as plain text in the <div>.
        text = ""
        for child in review.children:
            if child.type == "text":
                text += child.source + " "
        text = text.strip()
        text = plaintext(text)  # Remove HTML entities, tags, etc.
        if text:
            corpus.append((text, score))
            print score
            print text
            print
    except Exception, e:
        # print e
        pass

# Now and then, save the corpus of (review, score) items as a .csv file.
corpus.save("books-fr.csv")

# Can you think of other test data to mine for?
# Can you see why it would be useful to have different test sets?
# - Instead of book reviews + star rating, how about tweets + #win or #fail hashtag?
# - How about hotel reviews + star rating from http://fr.hotels.com?
# With Facebook.search(cached=False), a "live" request is sent to Facebook:
# we get the most recent results instead of those in the local cache.
# Keeping a local cache can also be useful (e.g., while testing)
# because a query is instant when it is executed the second time.
for status in fb.search("horrible", count=25, cached=False):
    print("=" * 100)
    print(status.id)
    print(status.text.encode("utf-8"))
    print(status.author)  # Yields an (id, name)-tuple.
    print(status.date)
    print(status.likes)
    print(status.comments)
    print()

    # Only add the status to the table if it doesn't already exist.
    if len(table) == 0 or status.id not in index:
        table.append([status.id, status.text])
        index.add(status.id)

# Create a .csv in pattern/examples/01-web/
table.save(pd("opinions.csv"))

# 2) Status updates from specific profiles.
# For this you need a personal license key:
# http://www.clips.ua.ac.be/pattern-facebook
license = ""

if license != "":
    fb = Facebook(license)
    # Facebook.profile() returns a dictionary with author info.
    # By default, this is your own profile.