from pattern.web import Twitter, Google, plaintext
from pattern.table import Table

# Collect tweets that mention each (César-nominated) film in a Table.
# Each appended row is: [nominee title, category, tweet date, plain text].
t = Table()
for nomme, categorie in (
        ("l'arnacoeur", "film"),
        ("le nom des gens", "film"),
        ("the ghost writer", "film"),
        ("tournée", "film"),
        ("des hommes et des dieux", "film"),
        ("gainsbourg, vie héroique", "film"),
        ("mammuth", "film")):
    for tweet in Twitter().search(nomme):
        # plaintext() strips HTML tags/entities from the tweet description.
        s = plaintext(tweet.description)
        # FIX: the original appended the undefined name "film" (NameError);
        # the loop variable that holds the category is "categorie".
        t.append([nomme, categorie, tweet.date, s])
from pattern.table import Table
from pattern.table import uid, pprint

# The main purpose of the pattern module is to facilitate automated processes
# for (text) data acquisition and (linguistical) data mining.
# Often, this involves a tangle of messy text files and custom formats to store the data.
# The Table class offers a useful datasheet (cfr. MS Excel) in Python code.
# It can be saved as a CSV text file that is both human/machine readable.
# See also: examples/01-web/03-twitter.py
# Supported values that are imported and exported correctly:
# str, unicode, int, float, bool, None
# For other data types, custom encoder and decoder functions can be used.

# Each row: [unique id, name, category]; uid() generates sequential ids.
t = Table(rows=[
    [uid(), "broccoli", "vegetable"],
    [uid(), "turnip", "vegetable"],
    [uid(), "asparagus", "vegetable"],
    [uid(), "banana", "fruit"],
])

print t.rows[0]     # A list of rows.
print t.columns[1]  # A list of columns, where each column is a list of values.
print

# Columns can be manipulated directly like any other Python list.
# This can be slow for large tables. If you need a fast way to do matrix math,
# use numpy (http://numpy.scipy.org/) instead.
# The purpose of Table is data storage.
t.columns.append([
    "green",
    "purple",
    "white",
# NOTE(review): SOURCE is truncated here, mid-expression — the closing of this
# list/call is missing. Compare the complete version of this same example
# elsewhere in the file, which appends a fourth value and closes the call.
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.table import Table
from pattern.table import uid, pprint, COUNT, FIRST

# Demonstrates grouping rows in a Table (comparable to SQL's GROUP BY).
t = Table(rows=[
    # 0-ID   1-NAME        2-TYPE       3-COLOR
    [uid(), "broccoli",  "vegetable", "green"],
    [uid(), "turnip",    "vegetable", "purple"],
    [uid(), "asparagus", "vegetable", "white"],
    [uid(), "banana",    "fruit",     "yellow"],
    [uid(), "orange",    "fruit",     "orange"]
])

# Count how many rows exist per type: take a copy that keeps only the
# TYPE and ID columns, then collapse it to one row per distinct type.
g = t.copy(columns=[2, 0])
g = g.group(0, COUNT)
# Available group functions: FIRST, LAST, COUNT, MAX, MIN, SUM, AVG, STDEV.
pprint(g)
print

# Group by type and glue all the names of that type into one "/"-separated string:
g = t.copy(columns=[2, 1])
g = g.group(0, function=lambda names: "/".join(names))
pprint(g)
print
import os, sys sys.path.append(os.path.join("..", "..", "..")) from pattern.web import Twitter, hashtags from pattern.table import Table, pprint # This example retrieves tweets containing given keywords from Twitter (http://twitter.com). try: # We store tweets in a Table that can be saved as a text file. # In the first column, we'll store a unique ID for each tweet. # We only want to add the latest tweets, i.e. those we haven't previously encountered. # With an index() on the first column we can quickly check if an ID already exists. # The index becomes important once more and more rows are added to the table (speed). table = Table.load("cool.txt") index = table.index(table.columns[0]) except: table = Table() index = {} engine = Twitter() # With cached=False, a live request is sent to Twitter, # so we get the latest results for the query instead of those in the local cache. for tweet in engine.search("is cooler than", count=25, cached=False): print tweet.description print tweet.author print tweet.date print hashtags(tweet.description) # Keywords in tweets start with a #. print
import os, sys; sys.path.append(os.path.join("..", "..", ".."))

from pattern.table import Table
from pattern.table import uid, pprint, COUNT, FIRST

# Shows how values in a Table can be grouped (cfr. SQL GROUP BY).
# Build the rows with a loop; uid() hands out a sequential id per row.
rows = []
for name, kind, color in (
        ("broccoli", "vegetable", "green"),
        ("turnip", "vegetable", "purple"),
        ("asparagus", "vegetable", "white"),
        ("banana", "fruit", "yellow"),
        ("orange", "fruit", "orange")):
    rows.append([uid(), name, kind, color])  # 0-ID 1-NAME 2-TYPE 3-COLOR
t = Table(rows=rows)

grouped = t.copy(columns=[2, 0])   # A copy with only the type and id columns.
grouped = grouped.group(0, COUNT)  # Group by type, count rows per type.
# Group functions: FIRST, LAST, COUNT, MAX, MIN, SUM, AVG, STDEV.
pprint(grouped)
print

# Group by type and concatenate all names per type:
grouped = t.copy(columns=[2, 1])
grouped = grouped.group(0, function=lambda values: "/".join(values))
pprint(grouped)
print

# This will group by type, count the id's per type, and concatenate all names per type.
# Each column is given a different grouping function.
from pattern.search import Pattern
from pattern.table import Table, pprint
# FIX: Yahoo, plaintext, Sentence and parse were referenced below but never
# imported, so the script raised NameError. pattern.web provides the Yahoo
# engine and plaintext(); Sentence/parse come from pattern's English parser
# (pattern.en) — TODO confirm the module path against the installed version.
from pattern.web import Yahoo, plaintext
from pattern.en import Sentence, parse

# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Yahoo! and stores the results in a table,
# which can be saved as a text file for further processing later on.
# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'           # Yahoo search query.
p = "NP (VP) more important than NP"  # Search pattern.
p = Pattern.fromstring(p)
t = Table()

engine = Yahoo(license=None)
for i in range(1):  # max=10
    for result in engine.search(q, start=i+1, count=100, cached=True):
        s = result.description
        s = plaintext(s)        # Strip HTML from the result snippet.
        s = Sentence(parse(s))  # Tag/chunk it so the pattern can match constituents.
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1]  # Left NP.
            b = m.constituents(constraint=5)[0]   # Right NP.
            t.append((a.string.lower(), b.string.lower()))

pprint(t)
from pattern.search import Pattern
from pattern.table import Table, pprint
# FIX: Yahoo, plaintext, Sentence and parse were referenced below but never
# imported, so the script raised NameError. pattern.web provides the Yahoo
# engine and plaintext(); Sentence/parse come from pattern's English parser
# (pattern.en) — TODO confirm the module path against the installed version.
from pattern.web import Yahoo, plaintext
from pattern.en import Sentence, parse

# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Yahoo! and stores the results in a table,
# which can be saved as a text file for further processing later on.
# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'           # Yahoo search query.
p = "NP (VP) more important than NP"  # Search pattern.
p = Pattern.fromstring(p)
t = Table()

engine = Yahoo(license=None)
for i in range(1):  # max=10
    for result in engine.search(q, start=i + 1, count=100, cached=True):
        s = result.description
        s = plaintext(s)        # Strip HTML from the result snippet.
        s = Sentence(parse(s))  # Tag/chunk it so the pattern can match constituents.
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1]  # Left NP.
            b = m.constituents(constraint=5)[0]   # Right NP.
            t.append((a.string.lower(), b.string.lower()))

pprint(t)
print
from pattern.web import Twitter, Google, plaintext
from pattern.table import Table

# Gather tweets about each politician, keep only the Dutch/French ones,
# translate them to English with Google, and log each as a Table row:
# [politician, party, tweet date, translated plain text].
t = Table()
for politician, party in (("nicolas sarkozy", "ump"), ("dsk", "ps")):
    for tweet in Twitter().search(politician):
        # Skip tweets that are not in Dutch or French.
        if tweet.language not in ("nl", "fr"):
            continue
        txt = plaintext(tweet.description)
        txt = Google().translate(txt, tweet.language, "en")
        # Placeholder for a future sentiment score per tweet:
        # w = sum([sentiment_score(word) for word in txt.split(" ")])
        t.append([politician, party, tweet.date, txt])
import os, sys sys.path.append(os.path.join("..", "..", "..")) from pattern.web import Twitter, hashtags from pattern.table import Table, pprint # This example retrieves tweets containing given keywords from Twitter (http://twitter.com). try: # We store tweets in a Table that can be saved as a text file. # In the first column, we'll store a unique ID for each tweet. # We only want to add the latest tweets, i.e. those we haven't previously encountered. # With an index() on the first column we can quickly check if an ID already exists. # The index becomes important once more and more rows are added to the table (speed). table = Table.load("cool.txt") index = table.index(table.columns[0]) except: table = Table() index = {} engine = Twitter() # With cached=False, a live request is sent to Twitter, # so we get the latest results for the query instead of those in the local cache. for tweet in engine.search("is cooler than", count=25, cached=False): print tweet.description print tweet.author print tweet.date print hashtags(tweet.description) # Keywords in tweets start with a #. print # Create a unique ID based on the tweet content and author.
from pattern.table import Table from pattern.table import uid, pprint # The main purpose of the pattern module is to facilitate automated processes # for (text) data acquisition and (linguistical) data mining. # Often, this involves a tangle of messy text files and custom formats to store the data. # The Table class offers a useful datasheet (cfr. MS Excel) in Python code. # It can be saved as a CSV text file that is both human/machine readable. # See also: examples/01-web/03-twitter.py # Supported values that are imported and exported correctly: # str, unicode, int, float, bool, None # For other data types, custom encoder and decoder functions can be used. t = Table(rows=[ [uid(), "broccoli", "vegetable"], [uid(), "turnip", "vegetable"], [uid(), "asparagus", "vegetable"], [uid(), "banana", "fruit"], ]) print t.rows[0] # A list of rows. print t.columns[1] # A list of columns, where each column is a list of values. print # Columns can be manipulated directly like any other Python list. # This can be slow for large tables. If you need a fast way to do matrix math, # use numpy (http://numpy.scipy.org/) instead. # The purpose of Table is data storage. t.columns.append(["green", "purple", "white", "yellow"]) # Save as a comma-separated (unicode) text file. t.save("food.txt")