def collect(db_name='', commit_every=1000, max_collect=400000, query_file=''):
    """
    Will continuously populate the sample database if it exists
    else it will create a new one.

    Keyword Arguments:
    db_name (str) -- Custom name for database. When empty, a name built
        from today's date is used ("samples-<year>-<month>-<day>.db").
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    query_file (str) -- If query file is provided should be absolute path
        to a text file with one entry per line. The first non-blank line
        is the label stored with every collected item; every non-blank
        line (the label line included) is used as a stream query.
        Without a query file, ':)' is collected as 'positive' and ':('
        as 'negative'.

    Returns:
    The string "Query file path does not exist." when query_file is given
    but does not exist, otherwise None.

    Raises:
    IndexError -- if the query file contains no non-blank lines.
    """
    if not db_name:
        d = datetime.datetime.now()
        # if no dbname is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)
    try:
        cursor = db.cursor()

        queries = {}
        if query_file:
            if not os.path.exists(query_file):
                return "Query file path does not exist."
            # Context manager closes the handle; blank lines are dropped so
            # an empty string is never registered as a query.
            with open(query_file) as f:
                stripped = (line.strip() for line in f)
                words = [w for w in stripped if w]
            label = words[0]
            # NOTE: the label line itself is also registered as a query,
            # which matches the long-standing behaviour of this function.
            for w in words:
                queries[w] = label
        else:
            queries[':)'] = 'positive'
            queries[':('] = 'negative'

        # collect on twitter with kral
        g = stream(query_list=queries.keys(), service_list="twitter")

        c = 0
        for item in g:
            text = unicode(item['text'])
            sentiment = queries.get(item['query'])
            if not sentiment:
                continue
            try:
                cursor.execute('INSERT INTO item VALUES (NULL,?,?)', [text, sentiment])
            except IntegrityError:
                # skip duplicates
                continue
            c += 1
            if c % commit_every == 0:
                db.commit()
                print("Commited {}".format(commit_every))
            if c == max_collect:
                break
    finally:
        # Flush rows inserted since the last periodic commit, then always
        # release the connection, even on early return or error.
        try:
            db.commit()
        finally:
            db.close()
def collect(db_name="", commit_every=1000, max_collect=400000, queries_file=""):
    """
    Will continuously populate the sample database if it exists
    else it will create a new one.

    Keyword Arguments:
    db_name (str) -- Custom name for database. When empty, a name built
        from today's date is used ("samples-<year>-<month>-<day>.db").
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    queries_file (str) -- If queries file is provided should be a path to
        a text file containing the queries, one per line, in the format:

            label
            query1
            queryN

        Every non-blank line (the label line included) is used as a
        stream query and stored with the label. Without a queries file,
        ":)" is collected as "positive" and ":(" as "negative".

    If the queries file cannot be read or holds no non-blank lines, a
    message is printed and the function returns without streaming, since
    no item could ever be matched to a label.
    """
    if not db_name:
        d = datetime.datetime.now()
        # if no dbname is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)
    try:
        cursor = db.cursor()

        queries = {}
        if queries_file:
            try:
                # Context manager closes the handle; blank lines are dropped
                # so an empty string is never registered as a query.
                with open(queries_file) as f:
                    stripped = (line.strip() for line in f)
                    words = [w for w in stripped if w]
            except IOError as err:
                # Report the failure instead of silently carrying on with
                # an empty query set.
                print("Could not read queries file {}: {}".format(queries_file, err))
                words = []
            if not words:
                print("No queries to collect, nothing to do.")
                return
            label = words[0]
            # NOTE: the label line itself is also registered as a query,
            # which matches the long-standing behaviour of this function.
            for w in words:
                queries[w] = label
        else:
            queries[":)"] = "positive"
            queries[":("] = "negative"

        # collect on twitter with kral
        g = stream(query_list=queries.keys(), service_list="twitter")

        c = 0
        for item in g:
            text = unicode(item["text"])
            sentiment = queries.get(item["query"])
            if not sentiment:
                continue
            try:
                cursor.execute("INSERT INTO item VALUES (NULL,?,?)", [text, sentiment])
            except IntegrityError:
                # skip duplicates
                continue
            c += 1
            if c % commit_every == 0:
                db.commit()
                print("Commited {}".format(commit_every))
            if c == max_collect:
                break
    finally:
        # Flush rows inserted since the last periodic commit, then always
        # release the connection, even on early return or error.
        try:
            db.commit()
        finally:
            db.close()
import kral

# Terms to search for and the services to search them on.
queries = ['android', 'bitcoin']
services = ['twitter', 'facebook']

# Print the originating service and the text of each streamed item.
for item in kral.stream(queries, services):
    print(item.service, item.text)
def collect(db_name='', commit_every=1000, max_collect=400000, query_file=''):
    """
    Will continuously populate the sample database if it exists
    else it will create a new one.

    Keyword Arguments:
    db_name (str) -- Custom name for database. When empty, a name built
        from today's date is used ("samples-<year>-<month>-<day>.db").
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    query_file (str) -- If query file is provided should be absolute path
        to a text file with one entry per line. The first non-blank line
        is the label stored with every collected item; every non-blank
        line (the label line included) is used as a stream query.
        Without a query file, ':)' is collected as 'positive' and ':('
        as 'negative'.

    Returns:
    The string "Query file path does not exist." when query_file is given
    but does not exist, otherwise None.

    Raises:
    ImportError -- if the kral package is not installed.
    IndexError -- if the query file contains no non-blank lines.
    """
    # collect requires kral
    try:
        from kral import stream
    except ImportError:
        raise ImportError("Requires the kral package in order to collect.")

    if not db_name:
        d = datetime.datetime.now()
        # if no dbname is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)
    try:
        cursor = db.cursor()

        queries = {}
        if query_file:
            if not os.path.exists(query_file):
                return "Query file path does not exist."
            # Context manager closes the handle; blank lines are dropped so
            # an empty string is never registered as a query.
            with open(query_file) as f:
                stripped = (line.strip() for line in f)
                words = [w for w in stripped if w]
            label = words[0]
            # NOTE: the label line itself is also registered as a query,
            # which matches the long-standing behaviour of this function.
            for w in words:
                queries[w] = label
        else:
            queries[':)'] = 'positive'
            queries[':('] = 'negative'

        # collect on twitter with kral
        g = stream(query_list=queries.keys(), service_list="twitter")

        c = 0
        for item in g:
            text = unicode(item['text'])
            sentiment = queries.get(item['query'])
            if not sentiment:
                continue
            try:
                cursor.execute('INSERT INTO item VALUES (NULL,?,?)', [text, sentiment])
            except IntegrityError:
                # skip duplicates
                continue
            c += 1
            if c % commit_every == 0:
                db.commit()
                print("Commited {}".format(commit_every))
            if c == max_collect:
                break
    finally:
        # Flush rows inserted since the last periodic commit, then always
        # release the connection, even on early return or error.
        try:
            db.commit()
        finally:
            db.close()
def test_service_buzz(self):
    # Take the first five items streamed from Buzz for the first query.
    buzz_stream = stream(self.queries[0], "buzz")
    received = list(islice(buzz_stream, 5))
    self.assertEqual(5, len(received), "Failed to stream from Buzz")
def test_service_twitter(self):
    # Take the first five items streamed from Twitter for the first query.
    twitter_stream = stream(self.queries[0], "twitter")
    received = list(islice(twitter_stream, 5))
    self.assertEqual(5, len(received), "Failed to stream from Twitter")
def test_service_identica(self):
    # Take the first five items streamed from Identica for the first query.
    identica_stream = stream(self.queries[0], "identica")
    received = list(islice(identica_stream, 5))
    self.assertEqual(5, len(received), "Failed to stream from Identica")
def test_query_single(self):
    # Stream a single query string and keep only the first five items.
    single_query = self.queries[0]
    received = list(islice(stream(single_query), 5))
    self.assertEqual(5, len(received), "Failed to stream with a single query")
def test_service_facebook(self):
    # Take the first five items streamed from Facebook for the first query.
    facebook_stream = stream(self.queries[0], "facebook")
    received = list(islice(facebook_stream, 5))
    self.assertEqual(5, len(received), "Failed to stream from Facebook")
def test_service_buzz(self):
    # Take the first five items streamed from Buzz for the first query.
    buzz_stream = stream(self.queries[0], 'buzz')
    received = list(islice(buzz_stream, 5))
    self.assertEqual(5, len(received), 'Failed to stream from Buzz')
def test_query_list(self):
    # Stream the whole list of queries and keep only the first five items.
    all_queries = self.queries
    received = list(islice(stream(all_queries), 5))
    self.assertEqual(5, len(received), "Failed to stream with list of queries")
def test_service_identica(self):
    # Take the first five items streamed from Identica for the first query.
    identica_stream = stream(self.queries[0], 'identica')
    received = list(islice(identica_stream, 5))
    self.assertEqual(5, len(received), 'Failed to stream from Identica')
def test_service_twitter(self):
    # Take the first five items streamed from Twitter for the first query.
    twitter_stream = stream(self.queries[0], 'twitter')
    received = list(islice(twitter_stream, 5))
    self.assertEqual(5, len(received), 'Failed to stream from Twitter')
def test_service_facebook(self):
    # Take the first five items streamed from Facebook for the first query.
    facebook_stream = stream(self.queries[0], 'facebook')
    received = list(islice(facebook_stream, 5))
    self.assertEqual(5, len(received), 'Failed to stream from Facebook')
def test_query_single(self):
    # Stream a single query string and keep only the first five items.
    single_query = self.queries[0]
    received = list(islice(stream(single_query), 5))
    self.assertEqual(5, len(received), 'Failed to stream with a single query')
def test_query_list(self):
    # Stream the whole list of queries and keep only the first five items.
    all_queries = self.queries
    received = list(islice(stream(all_queries), 5))
    self.assertEqual(5, len(received), 'Failed to stream with list of queries')