Пример #1
0
def collect(db_name='', commit_every=1000, max_collect=400000, query_file=''):
    """
    Will continuously populate the sample database if it exists
    else it will create a new one.

    Items are streamed from twitter for every query term. Each item whose
    query maps to a label is inserted into the ``item`` table together
    with that label; inserts raising IntegrityError are skipped.

    Keyword Arguments:
    db_name (str) -- Custom name for database. When empty, a name built
                     from today's date is used.
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    query_file (str) -- If query file is provided should be absolute path to text file.
                        The first non-blank line is the label and every
                        non-blank line (the label line included) is used
                        as a query for that label. Without a query file
                        ':)' is collected as 'positive' and ':(' as 'negative'.

    Returns:
    None once collecting has finished, or a message string when the query
    file does not exist or contains no usable lines.
    """

    if not db_name:
        d = datetime.datetime.now()
        #if no dbname is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)

    #the connection is open from here on, so every exit path (early
    #return, exception, normal completion) must go through the close below
    try:
        cursor = db.cursor()

        queries = {}

        if query_file:
            if not os.path.exists(query_file):
                return "Query file path does not exist."

            #context manager so the file handle is always released;
            #blank lines are dropped so they never become an empty query
            with open(query_file) as f:
                words = [line.strip() for line in f if line.strip()]

            if not words:
                return "Query file is empty."

            label = words[0]
            for w in words:
                queries[w] = label

        else:
            queries[':)'] =  'positive'
            queries[':('] =  'negative'

        #collect on twitter with kral
        g = stream(query_list=list(queries), service_list="twitter")

        c = 0
        for item in g:

            text = unicode(item['text'])

            sentiment = queries.get(item['query'], None)
            if not sentiment:
                continue

            try:
                cursor.execute('INSERT INTO item VALUES (NULL,?,?)', [text, sentiment])
            except IntegrityError: #skip duplicates
                continue

            c += 1
            if c % commit_every == 0:
                db.commit()
                print("Commited {}".format(commit_every))
            if c == max_collect:
                break

        #flush rows inserted since the last periodic commit; closing the
        #connection right after would otherwise leave them uncommitted
        #(assumes db is a sqlite3-style connection -- TODO confirm db_init)
        db.commit()
    finally:
        db.close()
Пример #2
0
def collect(db_name="", commit_every=1000, max_collect=400000, queries_file=""):
    """
    Will continuously populate the sample database if it exists
    else it will create a new one.

    Items are streamed from twitter for every query term. Each item whose
    query maps to a label is inserted into the ``item`` table together
    with that label; inserts raising IntegrityError are skipped.

    Keyword Arguments:
    db_name (str) -- Custom name for database. When empty, a name built
                     from today's date is used.
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    queries_file (str) -- If queries file is provided should be a path to a text file
                          containing the queries in the format:

                          label
                          query1
                          queryN

                          Blank lines are ignored. Note that the label line
                          itself is also used as a query. Without a queries
                          file ":)" is collected as "positive" and ":(" as
                          "negative".

    Returns:
    None. When the queries file cannot be read or holds no usable lines a
    message is printed and nothing is collected.
    """

    if not db_name:
        d = datetime.datetime.now()
        # if no dbname is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)

    # The connection is open from here on, so every exit path (early
    # return, exception, normal completion) must go through the close below.
    try:
        cursor = db.cursor()

        queries = {}

        if queries_file:
            try:
                # Context manager so the file handle is always released;
                # blank lines are dropped so they never become an empty query.
                with open(queries_file) as f:
                    words = [line.strip() for line in f if line.strip()]
            except IOError as err:
                # Without queries no streamed item can ever match a label,
                # so report the problem instead of streaming to no effect.
                print("Could not read queries file {}: {}".format(queries_file, err))
                return

            if not words:
                print("Queries file {} contains no label or queries.".format(queries_file))
                return

            label = words[0]
            for w in words:
                queries[w] = label

        else:
            queries[":)"] = "positive"
            queries[":("] = "negative"

        # collect on twitter with kral
        g = stream(query_list=list(queries), service_list="twitter")

        c = 0
        for item in g:

            text = unicode(item["text"])

            sentiment = queries.get(item["query"], None)
            if not sentiment:
                continue

            try:
                cursor.execute("INSERT INTO item VALUES (NULL,?,?)", [text, sentiment])
            except IntegrityError:  # skip duplicates
                continue

            c += 1
            if c % commit_every == 0:
                db.commit()
                print("Commited {}".format(commit_every))
            if c == max_collect:
                break

        # Flush rows inserted since the last periodic commit; closing the
        # connection right after would otherwise leave them uncommitted
        # (assumes db is a sqlite3-style connection -- TODO confirm db_init).
        db.commit()
    finally:
        db.close()
Пример #3
0
import kral

# Terms to search for.
queries = ['android', 'bitcoin']

# Services the terms are streamed from.
services = ['twitter', 'facebook']

# Print the service and text of every item the stream yields.
feed = kral.stream(queries, services)
for item in feed:
    print(item.service, item.text)
    
Пример #4
0
def collect(db_name='', commit_every=1000, max_collect=400000, query_file=''):
    """
    Will continuously populate the sample database if it exists
    else it will create a new one.

    Items are streamed from twitter for every query term. Each item whose
    query maps to a label is inserted into the ``item`` table together
    with that label; inserts raising IntegrityError are skipped.

    Keyword Arguments:
    db_name (str) -- Custom name for database. When empty, a name built
                     from today's date is used.
    commit_every (int) -- Commit to sqlite after commit_every executes.
    max_collect (int) -- Will stop collecting at this number.
    query_file (str) -- If query file is provided should be absolute path to text file.
                        The first non-blank line is the label and every
                        non-blank line (the label line included) is used
                        as a query for that label. Without a query file
                        ':)' is collected as 'positive' and ':(' as 'negative'.

    Returns:
    None once collecting has finished, or a message string when the query
    file does not exist or contains no usable lines.

    Raises:
    ImportError -- If the kral package is not installed.
    """

    #collect requires kral
    try:
        from kral import stream
    except ImportError:
        raise ImportError("Requires the kral package in order to collect.")

    if not db_name:
        d = datetime.datetime.now()
        #if no dbname is provided we'll store a timestamped db name
        db_name = "samples-%s-%s-%s.db" % (d.year, d.month, d.day)

    db = db_init(db=db_name)

    #the connection is open from here on, so every exit path (early
    #return, exception, normal completion) must go through the close below
    try:
        cursor = db.cursor()

        queries = {}

        if query_file:
            if not os.path.exists(query_file):
                return "Query file path does not exist."

            #context manager so the file handle is always released;
            #blank lines are dropped so they never become an empty query
            with open(query_file) as f:
                words = [line.strip() for line in f if line.strip()]

            if not words:
                return "Query file is empty."

            label = words[0]
            for w in words:
                queries[w] = label

        else:
            queries[':)'] =  'positive'
            queries[':('] =  'negative'

        #collect on twitter with kral
        g = stream(query_list=list(queries), service_list="twitter")

        c = 0
        for item in g:

            text = unicode(item['text'])

            sentiment = queries.get(item['query'], None)
            if not sentiment:
                continue

            try:
                cursor.execute('INSERT INTO item VALUES (NULL,?,?)', [text, sentiment])
            except IntegrityError: #skip duplicates
                continue

            c += 1
            if c % commit_every == 0:
                db.commit()
                print("Commited {}".format(commit_every))
            if c == max_collect:
                break

        #flush rows inserted since the last periodic commit; closing the
        #connection right after would otherwise leave them uncommitted
        #(assumes db is a sqlite3-style connection -- TODO confirm db_init)
        db.commit()
    finally:
        db.close()
Пример #5
0
 def test_service_buzz(self):
     # Take at most five items for the first query from the "buzz" service
     # and require that all five were delivered.
     first_five = islice(stream(self.queries[0], "buzz"), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), "Failed to stream from Buzz")
Пример #6
0
 def test_service_twitter(self):
     # Take at most five items for the first query from the "twitter"
     # service and require that all five were delivered.
     first_five = islice(stream(self.queries[0], "twitter"), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), "Failed to stream from Twitter")
Пример #7
0
 def test_service_identica(self):
     # Take at most five items for the first query from the "identica"
     # service and require that all five were delivered.
     first_five = islice(stream(self.queries[0], "identica"), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), "Failed to stream from Identica")
Пример #8
0
 def test_query_single(self):
     # Stream with only the first query (no service argument) and require
     # that five items were delivered.
     first_five = islice(stream(self.queries[0]), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), "Failed to stream with a single query")
Пример #9
0
 def test_service_facebook(self):
     # Take at most five items for the first query from the "facebook"
     # service and require that all five were delivered.
     first_five = islice(stream(self.queries[0], "facebook"), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), "Failed to stream from Facebook")
Пример #10
0
 def test_service_buzz(self):
     # Take at most five items for the first query from the 'buzz' service
     # and require that all five were delivered.
     first_five = islice(stream(self.queries[0], 'buzz'), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), 'Failed to stream from Buzz')
Пример #11
0
 def test_query_list(self):
     # Stream with the whole query list (no service argument) and require
     # that five items were delivered.
     first_five = islice(stream(self.queries), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), "Failed to stream with list of queries")
Пример #12
0
 def test_service_identica(self):
     # Take at most five items for the first query from the 'identica'
     # service and require that all five were delivered.
     first_five = islice(stream(self.queries[0], 'identica'), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), 'Failed to stream from Identica')
Пример #13
0
 def test_service_twitter(self):
     # Take at most five items for the first query from the 'twitter'
     # service and require that all five were delivered.
     first_five = islice(stream(self.queries[0], 'twitter'), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), 'Failed to stream from Twitter')
Пример #14
0
 def test_service_facebook(self):
     # Take at most five items for the first query from the 'facebook'
     # service and require that all five were delivered.
     first_five = islice(stream(self.queries[0], 'facebook'), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), 'Failed to stream from Facebook')
Пример #15
0
 def test_query_single(self):
     # Stream with only the first query (no service argument) and require
     # that five items were delivered.
     first_five = islice(stream(self.queries[0]), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), 'Failed to stream with a single query')
Пример #16
0
 def test_query_list(self):
     # Stream with the whole query list (no service argument) and require
     # that five items were delivered.
     first_five = islice(stream(self.queries), 5)
     received = list(first_five)
     self.assertEqual(5, len(received), 'Failed to stream with list of queries')