Пример #1
0
 def test_time(self):
     # Subtracting a time() offset from a Date shifts every unit given.
     start = db.date("2010-09-21 9:27:00")
     offset = db.time(days=1, hours=1, minutes=1, seconds=1)
     shifted = start - offset
     self.assertEqual(str(shifted), "2010-09-20 08:25:59")
     # Adding time(years, months) to January 31: the expected result is
     # February 28 of the next year, since February has no 31st.
     start = db.date(2014, 1, 31)
     shifted = start + db.time(years=1, months=1)
     self.assertEqual(str(shifted), "2015-02-28 00:00:00")
     print("pattern.db.time()")
Пример #2
0
 def test_time(self):
     # Date - time(): one day, hour, minute and second earlier.
     delta = db.time(days=1, hours=1, minutes=1, seconds=1)
     earlier = db.date("2010-09-21 9:27:00") - delta
     self.assertEqual(str(earlier), "2010-09-20 08:25:59")
     # Date + time(years, months): Jan 31 plus one year and one month is
     # expected to be reported as Feb 28.
     later = db.date(2014, 1, 31) + db.time(years=1, months=1)
     self.assertEqual(str(later), "2015-02-28 00:00:00")
     print("pattern.db.time()")
Пример #3
0
 def setUp(self):
     # Build the CSV fixture used by the tests: two rows of data and one
     # (name, type) pair per column.
     rows = [
         [u"Schrödinger", "cat", True, 3, db.date(2009, 11, 3)],
         [u"Hofstadter", "labrador", True, 5, db.date(2007, 8, 4)],
     ]
     fields = [
         ["name", db.STRING],
         ["type", db.STRING],
         ["tail", db.BOOLEAN],
         ["age", db.INTEGER],
         ["date", db.DATE],
     ]
     self.csv = db.CSV(rows=rows, fields=fields)
Пример #4
0
 def test_format(self):
     # Assert custom input formats.
     v = db.date("2010-09", "%Y-%m")
     self.assertEqual(str(v), "2010-09-01 00:00:00")
     self.assertEqual(v.year, 2010)
     # Assert custom output formats.
     v = db.date("2010-09", "%Y-%m", format="%Y-%m")
     self.assertEqual(v.format, "%Y-%m")
     self.assertEqual(str(v), "2010-09")
     self.assertEqual(v.year, 2010)
     # Assert strftime() for date < 1900.
     v = db.date(1707, 4, 15)
     self.assertEqual(str(v), "1707-04-15 00:00:00")
     # Reading .timestamp on such a date is expected to raise ValueError;
     # the lambda defers the attribute access until assertRaises calls it.
     self.assertRaises(ValueError, lambda: v.timestamp)
     # Parenthesized single-argument print works on Python 2 and Python 3.
     print("pattern.db.Date.__str__()")
Пример #5
0
class TestDatabase(unittest.TestCase):
    """Backend-independent database test cases.

    Subclasses are expected to provide self.db (and self.type) in setUp().
    """

    def setUp(self):
        # Define self.db and self.type in a subclass.
        pass

    def tearDown(self):
        # Snapshot the tables before dropping them, so that removing a table
        # cannot disturb the iteration over the database.
        for table in list(self.db):
            self.db.drop(table)

    def test_escape(self):
        # Assert str, unicode, int, float, bool and None field values.
        # The Python 2-only long literal (1L) is written as int(1), as in the
        # ported version of this test; Python 3 has a single int type.
        for v, s in (("a", "'a'"), (u"a", "'a'"), (1, "1"), (int(1), "1"),
                     (1.0, "1.0"), (True, "1"), (False, "0"), (None, "null")):
            self.assertEqual(db._escape(v), s)
        # Assert date.
        v = db.date("1707-04-15")
        self.assertEqual(db._escape(v), "'1707-04-15 00:00:00'")
        # Assert current date.
        v = "current_timestamp"
        self.assertEqual(db._escape(v), "current_timestamp")
        # Assert subquery.
        v = self.db.create("dummy", fields=[db.pk()])
        v = v.query()
        self.assertEqual(db._escape(v), "(select dummy.* from `dummy`)")
        # Assert MySQL and SQLite quotes.
        if self.db.type == db.MYSQL:
            self.assertEqual(self.db.escape("'"), "'\\''")
        if self.db.type == db.SQLITE:
            self.assertEqual(self.db.escape("'"), "''''")
        # Parenthesized single-argument print works on Python 2 and Python 3.
        print("pattern.db._escape()")
Пример #6
0
 def test_escape(self):
     # Scalar field values (str, int, float, bool, None) paired with the
     # SQL literal each one is expected to be escaped to.
     cases = (
         ("a", "'a'"),
         (1, "1"),
         (int(1), "1"),
         (1.0, "1.0"),
         (True, "1"),
         (False, "0"),
         (None, "null"),
     )
     for value, expected in cases:
         self.assertEqual(db._escape(value), expected)
     # A date is escaped to a quoted timestamp string.
     stamp = db.date("1707-04-15")
     self.assertEqual(db._escape(stamp), "'1707-04-15 00:00:00'")
     # The current_timestamp keyword is passed through without quotes.
     keyword = "current_timestamp"
     self.assertEqual(db._escape(keyword), "current_timestamp")
     # The result of Table.query() is escaped to a parenthesized subquery.
     table = self.db.create("dummy", fields=[db.pk()])
     subquery = table.query()
     self.assertEqual(db._escape(subquery), "(select dummy.* from `dummy`)")
     # Single quotes are escaped differently by MySQL and SQLite.
     if self.db.type == db.MYSQL:
         self.assertEqual(self.db.escape("'"), "'\\''")
     if self.db.type == db.SQLITE:
         self.assertEqual(self.db.escape("'"), "''''")
     print("pattern.db._escape()")
Пример #7
0
 def test_escape(self):
     # Plain values and the SQL text db._escape() should produce for them.
     plain_values = ("a", 1, int(1), 1.0, True, False, None)
     sql_literals = ("'a'", "1", "1", "1.0", "1", "0", "null")
     for raw, escaped in zip(plain_values, sql_literals):
         self.assertEqual(db._escape(raw), escaped)
     # Dates become quoted timestamps.
     self.assertEqual(db._escape(db.date("1707-04-15")), "'1707-04-15 00:00:00'")
     # The current_timestamp keyword stays unquoted.
     self.assertEqual(db._escape("current_timestamp"), "current_timestamp")
     # A query on a freshly created table becomes a parenthesized subquery.
     dummy = self.db.create("dummy", fields=[db.pk()])
     self.assertEqual(db._escape(dummy.query()), "(select dummy.* from `dummy`)")
     # Quote escaping depends on the database backend.
     if self.db.type == db.MYSQL:
         self.assertEqual(self.db.escape("'"), "'\\''")
     if self.db.type == db.SQLITE:
         self.assertEqual(self.db.escape("'"), "''''")
     print("pattern.db._escape()")
Пример #8
0
    def test_timestamp(self):
        # Assert Date.timestamp.
        if True:
            raise unittest.SkipTest("FIXME see GH issue 94.")

        v = db.date(2010, 9, 21, format=db.DEFAULT_DATE_FORMAT)
        self.assertEqual(v.timestamp, 1285041600)
        print("pattern.db.Date.timestamp")
Пример #9
0
    def test_timestamp(self):
        # Assert Date.timestamp.
        if True:
            raise unittest.SkipTest("FIXME see GH issue 94.")

        v = db.date(2010, 9, 21, format=db.DEFAULT_DATE_FORMAT)
        self.assertEqual(v.timestamp, 1285020000)
        print("pattern.db.Date.timestamp")
Пример #10
0
 def test_date(self):
     # Assert string input and default date formats.
     # Every entry (nine strings and one integer timestamp) is expected to
     # parse to 21 September 2010 with the default output format.
     for s in (
             "2010-09-21 09:27:01",
             "2010-09-21T09:27:01Z",
             "2010-09-21T09:27:01+0000",
             "2010-09-21 09:27",
             "2010-09-21",
             "21/09/2010",
             "21 September 2010",
             "September 21 2010",
             "September 21, 2010",
             1285054021):
         v = db.date(s)
         self.assertEqual(v.format, "%Y-%m-%d %H:%M:%S")
         self.assertEqual(v.year, 2010)
         self.assertEqual(v.month, 9)
         self.assertEqual(v.day, 21)
     # Assert NOW.
     for v in (db.date(), db.date(db.NOW)):
         self.assertEqual(v.year, datetime.datetime.now().year)
         self.assertEqual(v.month, datetime.datetime.now().month)
         self.assertEqual(v.day, datetime.datetime.now().day)
     self.assertEqual(db.date().year, db.YEAR)
     # Assert integer input.
     v1 = db.date(2010, 9, 21, format=db.DEFAULT_DATE_FORMAT)
     v2 = db.date(2010, 9, 21, 9, 27, 1, 0, db.DEFAULT_DATE_FORMAT)
     # second=1 (not 01): a leading-zero integer literal is octal on
     # Python 2 and a SyntaxError on Python 3.
     v3 = db.date(2010, 9, 21, hour=9, minute=27, second=1, format=db.DEFAULT_DATE_FORMAT)
     self.assertEqual(str(v1), "2010-09-21 00:00:00")
     self.assertEqual(str(v2), "2010-09-21 09:27:01")
     self.assertEqual(str(v3), "2010-09-21 09:27:01")
     # Assert DateError for other input.
     self.assertRaises(db.DateError, db.date, None)
     # Parenthesized single-argument print works on Python 2 and Python 3.
     print("pattern.db.date()")
Пример #11
0
 def test_csv(self):
     # Assert saving and loading data (field types are preserved).
     v = self.csv
     try:
         v.save("test.csv", headers=True)
         v = db.CSV.load("test.csv", headers=True)
         self.assertTrue(isinstance(v, list))
         self.assertTrue(v.headers[0] == (u"name", db.STRING))
         self.assertTrue(v[0] == [u"Schrödinger", "cat", True, 3, db.date(2009, 11, 3)])
     finally:
         # Remove the temporary file even when an assertion above fails,
         # so a failing run does not leave test.csv behind.
         if os.path.exists("test.csv"):
             os.unlink("test.csv")
     # Parenthesized single-argument print works on Python 2 and Python 3.
     print("pattern.db.CSV")
     print("pattern.db.CSV.save()")
     print("pattern.db.CSV.load()")
Пример #12
0
 def test_group(self):
     # Assert WHERE with AND/OR combinations from Group object().
     yesterday  = db.date()
     yesterday -= db.time(days=1)
     g1 = db.Group(("name", "garlic bread"))
     g2 = db.Group(("name", "pizza"), ("price", 10, "<"), operator=db.AND)
     g3 = db.Group(g1, g2, operator=db.OR)
     g4 = db.Group(g3, ("date", yesterday, ">"), operator=db.AND)
     self.assertEqual(g1.SQL(), "name='garlic bread'")
     self.assertEqual(g2.SQL(), "name='pizza' and price<10")
     # Nested groups are expected to be wrapped in parentheses.
     self.assertEqual(g3.SQL(), "(name='garlic bread') or (name='pizza' and price<10)")
     self.assertEqual(g4.SQL(), "((name='garlic bread') or (name='pizza' and price<10)) and date>'%s'" % yesterday)
     # Assert subquery in group.
     q = self._query(fields=["name"])
     g = db.any(("name", u"Gödel"), ("name", q))
     self.assertEqual(g.SQL(), u"name='Gödel' or name in (select persons.name from `persons`)")
     # Parenthesized single-argument print works on Python 2 and Python 3.
     print("pattern.db.Group")
Пример #13
0
 def test_filterchain(self):
     # Assert WHERE with AND/OR combinations from FilterChain object().
     yesterday  = db.date()
     yesterday -= db.time(days=1)
     f1 = db.FilterChain(("name", "garlic bread"))
     f2 = db.FilterChain(("name", "pizza"), ("price", 10, "<"), operator=db.AND)
     f3 = db.FilterChain(f1, f2, operator=db.OR)
     f4 = db.FilterChain(f3, ("date", yesterday, ">"), operator=db.AND)
     self.assertEqual(f1.SQL(), "name='garlic bread'")
     self.assertEqual(f2.SQL(), "name='pizza' and price<10")
     # Nested chains are expected to be wrapped in parentheses.
     self.assertEqual(f3.SQL(), "(name='garlic bread') or (name='pizza' and price<10)")
     self.assertEqual(f4.SQL(), "((name='garlic bread') or (name='pizza' and price<10)) and date>'%s'" % yesterday)
     # Assert subquery in filter chain.
     q = self._query(fields=["name"])
     f = db.any(("name", u"Gödel"), ("name", q))
     self.assertEqual(f.SQL(), u"name='Gödel' or name in (select persons.name from `persons`)")
     # Parenthesized single-argument print works on Python 2 and Python 3.
     print("pattern.db.FilterChain")
Пример #14
0
 def test_filterchain(self):
     # FilterChain objects should render (nested) AND/OR WHERE clauses.
     yesterday = db.date()
     yesterday -= db.time(days=1)
     bread = db.FilterChain(("name", "garlic bread"))
     cheap_pizza = db.FilterChain(("name", "pizza"), ("price", 10, "<"), operator=db.AND)
     either = db.FilterChain(bread, cheap_pizza, operator=db.OR)
     recent = db.FilterChain(either, ("date", yesterday, ">"), operator=db.AND)
     # Each chain paired with the SQL it is expected to produce.
     expectations = (
         (bread, "name='garlic bread'"),
         (cheap_pizza, "name='pizza' and price<10"),
         (either, "(name='garlic bread') or (name='pizza' and price<10)"),
         (recent, "((name='garlic bread') or (name='pizza' and price<10)) and date>'%s'" % yesterday),
     )
     for chain, sql in expectations:
         self.assertEqual(chain.SQL(), sql)
     # A query used as a filter value is expected to render as "in (subquery)".
     subquery = self._query(fields=["name"])
     chain = db.any(("name", u"Gödel"), ("name", subquery))
     self.assertEqual(chain.SQL(), u"name='Gödel' or name in (select persons.name from `persons`)")
     print("pattern.db.FilterChain")
Пример #15
0
def age(name):
    """ Returns the age of the given person, or None if it can't be determined.
        The person's English Wikipedia article is tried in three ways, in order:
        1) the .bday / .dday elements, falling back to "(born 1 December 2000)"
           in the text for the birth year and to the current year for the end year,
        2) a "(May 15, 1912 – October 7, 2003)" life span in the text,
        3) an "(aged 78)" cell in the infobox.
    """
    # Pre-bind the parsed page and its plain text, so the later attempts fail
    # in a predictable way when the article could not be loaded at all.
    t = s = None
    # Use regular expression to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s) # Strip footnote markers like [12].
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)" # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0] # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except Exception:
        # Best effort: fall through to the next format.
        # (Exception instead of a bare except, so Ctrl-C still works.)
        pass
    try:
        # Unicode literals with escaped backslashes replace the ur"" prefix,
        # which is a syntax error on Python 3.
        r = u"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = u"\\(%s – %s\\)" % (r, r) # (May 15, 1912 – October 7, 2003)
        m = re.search(r, s)
        x = m.group(1)
        y = m.group(2)
        a = int(y) - int(x)
        return a
    except Exception:
        pass
    try:
        r = r"\(aged ([0-9]+)\)" # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except Exception:
        pass
    return None
Пример #16
0
def age(name):
    """ Returns the age of the given person, or None if no age could be parsed.
        Three sources in the person's English Wikipedia article are tried in turn:
        - the .bday and .dday elements (with "(born 1 December 2000)" in the text
          as fallback birth year, and the current year as fallback end year),
        - a "(May 15, 1912 – October 7, 2003)" span in the text,
        - an "(aged 78)" cell in the infobox.
    """
    # Bind these up front: if loading the article fails half-way, the later
    # attempts then raise an ordinary (handled) error instead of a NameError.
    t = None
    s = None
    # Use regular expression to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)  # Strip footnote markers like [12].
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"  # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0]  # YYYY-MM-DD
        y = y.split("-")[0]
        return int(y) - int(x)
    except Exception:
        # Best effort: try the next format. Catching Exception (not a bare
        # except) lets KeyboardInterrupt and SystemExit propagate.
        pass
    try:
        # The ur"" prefix does not exist on Python 3, so the pattern is
        # written as a unicode literal with escaped backslashes.
        year = u"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        span = u"\\(%s – %s\\)" % (year, year)  # (May 15, 1912 – October 7, 2003)
        m = re.search(span, s)
        return int(m.group(2)) - int(m.group(1))
    except Exception:
        pass
    try:
        r = r"\(aged ([0-9]+)\)"  # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = plaintext(a[0].content)
        return int(re.search(r, a).group(1))
    except Exception:
        pass
    return None
Пример #17
0
    def insertWebpage(self, page, connection=False):
        """Store `page` in the websites table and record its parent relation.

        The page's domain (if any) is inserted first and its id is stored
        with the row. When `connection` is true, the row is stamped with the
        current date. If the insert raises sqlite3.IntegrityError
        (presumably because the address is already stored; confirm against
        the schema), the existing row is only updated when `connection` is
        true, and no relation is inserted.
        """
        if page.url.domain:
            self.insertDomain(page.url.domain)
            idDomain = self.getDomainId(page.url.domain)
        else:
            idDomain = None

        dateVisited = date(NOW) if connection else None

        try:
            self.db.websites.append(
                address=page.url.string,
                domain_id=idDomain,
                connected=connection,
                lastVisited=dateVisited,
            )
            self.insertRelation(page.parent, page)
        except sqlite3.IntegrityError:
            if not connection:
                return
            matching = all(eq('address', page.url.string))
            self.db.websites.update(matching,
                                    connected=True,
                                    lastVisited=dateVisited)
Пример #18
0
 def test_time(self):
     # Assert Date - time(): every unit given is subtracted from the date.
     v = db.date("2010-09-21 9:27:00")
     v = v - db.time(days=1, hours=1, minutes=1, seconds=1)
     self.assertEqual(str(v), "2010-09-20 08:25:59")
     # Parenthesized single-argument print works on Python 2 and Python 3.
     print("pattern.db.time()")
Пример #19
0
 def test_timestamp(self):
     # Assert Date.timestamp.
     # NOTE(review): the expected value looks timezone-dependent.
     # 1285020000 is two hours before 2010-09-21 00:00 UTC, so it matches
     # local midnight only at UTC+2. Confirm how Date.timestamp treats the
     # local time zone before relying on this assertion elsewhere.
     v = db.date(2010, 9, 21, format=db.DEFAULT_DATE_FORMAT)
     self.assertEqual(v.timestamp, 1285020000)
     # Parenthesized single-argument print works on Python 2 and Python 3.
     print("pattern.db.Date.timestamp")
Пример #20
0
# Pattern is a web mining module for the Python programming language.
# It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser), natural
# language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning
# (vector space model, clustering, SVM), network analysis and <canvas> visualization.
# Web mining.
# A simple web mining technique.

from pattern.web import Newsfeed, plaintext
from pattern.db import date
from pattern.vector import Model, Document, LEMMA

news, url = {}, 'http://news.google.com/news?output=rss'

for story in Newsfeed().search(url, cached=False):

    d = str(date(story.date, format='%Y-%m-%d'))
    s = plaintext(story.description)

    # Each key in the news dictionary is a date: news is grouped per day.
    # Each value is a dictionary of id => story items.
    # We use the hash of the plain-text description as a unique id to avoid
    # duplicate content.

    news.setdefault(d, {})[hash(s)] = s

# Your code will probably have some preprocessing steps to save and load the mined news updates.

m = Model()

# The loop variable is named `day` rather than `date`, so that it does not
# overwrite the date() function imported from pattern.db.
for day, stories in news.items():
    # Merge all stories of one day into a single lowercase text.
    # NOTE(review): nothing visible here uses `s` afterwards; the loop body
    # appears to continue beyond this excerpt.
    s = stories.values()
    s = ' '.join(s).lower()
Пример #21
0
from builtins import str, bytes, dict, int

import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.db import date, time, NOW
from pattern.web import Bing, NEWS

# It is often useful to keep a date stamp for each row in the table.
# The pattern.db module's date() function can be used for this.
# It is a simple wrapper around Python's datetime.datetime class,
# with extra functionality to make it easy to parse or print it as a string.

# Current date, with and without an explicit NOW, followed by dates parsed
# with a custom input format (and, in the last case, a custom output format).
print(date(NOW))
print(date())
print(date("2010-11-01 16:30", "%Y-%m-%d %H:%M"))
print(date("Nov 1, 2010", "%b %d, %Y"))
print(date("Nov 1, 2010", "%b %d, %Y", format="%d/%m/%Y"))
print("")

# The formatting directives are those of time.strftime():
# http://docs.python.org/library/time.html#time.strftime

engine = Bing(license=None, language="en")
for result in engine.search("today", type=NEWS):
    print(result.title)
    # Result.date is a string (e.g. we can't > <= += with the date).
    print(repr(result.date))
    # date() can parse any Result.date in the web module.
    print(date(result.date))
    print("")
Пример #22
0
def oringinalTwInsert(filename,db):
    """Parse the original tweets in `filename` and insert them into `originaltable`.

    Per-tweet metadata (user id, creation date, two further fields) is looked
    up by tweet id in orgin.txt, whose lines are tab-separated:
    id, user id, date, and two more fields. The tweet file is read as a header
    line (first token is the tweet id), a text line, and optional lines that
    start with '@' or 'link'.

    Parsing stops at the first line that cannot be parsed (this is also how
    the end of the file is detected) or at the first tweet id that is missing
    from orgin.txt. The rows collected up to that point are then inserted in
    batches of 10000. A batch that raises IntegrityError is reported and
    skipped; any other database error propagates to the caller.

    `db` is accepted for compatibility with callers but is not used.
    """
    # NOTE(review): the database credentials are hard-coded below; consider
    # moving them to configuration.
    with open(filename,'rb') as f, \
            open('/media/M_fM__VM_0M_eM__JM__M_eM__MM_7/tangjie/weibocontents/orgin.txt','r') as forigin:
        conn = MySQLdb.connect(host='localhost',user='******',passwd='20090924',charset='utf8')
        cursor = conn.cursor()
        try:
            DB_NAME = 'tangdb'
            conn.select_db(DB_NAME)

            # Tweet id => [user id, created at, field 3, field 4].
            # Split on tabs only: the file is written tab-separated and the
            # date field itself may contain a space ("YYYY-MM-DD HH:MM:SS"),
            # so splitting on any whitespace would shift the later fields.
            originaldict={}
            for line in forigin:
                if not line.strip():
                    # Blank line: no key to store. The message is kept as is.
                    print('Duplicate Key')
                    continue
                l=line.rstrip('\r\n').split('\t')
                originaldict[l[0]]=l[1:]

            c=('@','link')
            i=1
            orlist=[]
            orstr=f.readline()
            while True:
                try:
                    instr=zh2unicode(orstr.strip())
                    if instr is None:
                        # Nothing to parse and nothing advances the file
                        # position: stop instead of looping forever.
                        break
                    ortw=instr.split()
                    ortwid=ortw[0]
                    if ortwid not in originaldict:
                        # Stop parsing (rather than exiting the process), so
                        # the rows collected so far still get inserted.
                        print('key error: '+str(ortwid))
                        break
                    origlist=originaldict[ortwid]
                    ortwuid=origlist[0]
                    orcreAt=date(origlist[1])
                    orRt=origlist[2]
                    rt=origlist[3]
                    ortweet=zh2unicode(f.readline().strip())
                    orstr=zh2unicode(f.readline().strip())
                    mention=''
                    link=''
                    # Consume the optional '@' / 'link' lines of this tweet.
                    # An empty line (e.g. at the end of the file) ends the
                    # tweet without discarding it.
                    tokens=orstr.split()
                    while tokens and tokens[0] in c:
                        if tokens[0]=='@':
                            mention=' '.join(tokens[1:])
                        if tokens[0]=='link':
                            link=' '.join(tokens[1:])
                        orstr=zh2unicode(f.readline().strip())
                        tokens=orstr.split()
                    orRecord=(i,ortwid,ortwuid,orcreAt,ortweet,mention,link,orRt,rt)
                    orlist.append(orRecord)
                    i=i+1
                    print('tweet: '+ortwid+' has finished!')
                except Exception:
                    # Any parse failure (including the empty header line at
                    # the end of the file) ends the parsing phase.
                    break

            # Insert the collected rows in batches of 10000.
            for start in range(0,len(orlist),10000):
                insertlist=orlist[start:start+10000]
                try:
                    cursor.executemany('INSERT INTO originaltable values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',insertlist)
                    conn.commit()
                except IntegrityError:
                    print("Duplicate entry for key originaltable_sid")
                # Progress: index just past the batch that was attempted.
                print(start+10000)
        finally:
            cursor.close()
            conn.close()
Пример #23
0
def retweetInsert(filename,db):
    """Parse tweets and their retweets from `filename` and insert the retweets into `retwtable`.

    For every original tweet, one tab-separated line (id, user id, creation
    date, last header token, retweet count) is written to orgin.txt. Retweets
    whose id was already seen are written to dupretweet.txt instead of being
    inserted. The retweets of one original tweet are inserted as one batch;
    a batch that raises IntegrityError is reported and skipped.

    Parsing stops at the first line that cannot be parsed; this is also how
    the end of the file is detected.

    `db` is accepted for compatibility with callers but is not used.
    """
    # NOTE(review): the database credentials are hard-coded below; consider
    # moving them to configuration.
    with open(filename,'rb') as f, \
            open('/media/M_fM__VM_0M_eM__JM__M_eM__MM_7/tangjie/weibocontents/orgin.txt','w') as forigin, \
            open('/media/M_fM__VM_0M_eM__JM__M_eM__MM_7/tangjie/weibocontents/dupretweet.txt','w') as fdup:
        c=('@','retweet','link')
        conn = MySQLdb.connect(host='localhost',user='******',passwd='20090924',charset='utf8')
        cursor = conn.cursor()
        try:
            DB_NAME = 'tangdb'
            conn.select_db(DB_NAME)
            i=1
            m=0
            rtstr=f.readline()
            # Ids of the retweets seen so far; a set keeps the duplicate
            # check constant-time however many retweets were read.
            rtAll=set()
            while True:
                try:
                    instr=zh2unicode(rtstr.strip())
                    if instr is None:
                        # Nothing to parse and nothing advances the file
                        # position: stop instead of looping forever.
                        break
                    ortw=instr.split()
                    ortwid=ortw[0]
                    ortwuid=ortw[1]
                    # The last '-' separates the date part from the time part.
                    timelist=ortw[2].strip().split('-')
                    timestr='-'.join(timelist[:-1])+' '+timelist[-1]
                    orcreAt=date(timestr)
                    orRt=ortw[-1]
                    rt=f.readline().strip()
                    original=[ortwid,ortwuid,str(orcreAt),orRt,rt]
                    forigin.write('\t'.join(original)+'\n')
                    rwlist=[]
                    rtstr=zh2unicode(f.readline().strip())
                    for _ in range(int(rt)):
                        rtlist=rtstr.split()
                        rtuid=rtlist[0]
                        timelist=rtlist[1].strip().split('-')
                        timestr='-'.join(timelist[:-1])+' '+timelist[-1]
                        rtcreAt=date(timestr)
                        rtid=rtlist[-1]
                        rtTw=zh2unicode(f.readline().strip())
                        rtstr=zh2unicode(f.readline().strip())
                        mention=''
                        rtfrom=''
                        link=''
                        # Consume the optional '@' / 'retweet' / 'link' lines.
                        # An empty line (e.g. at the end of the file) ends
                        # the retweet without discarding it.
                        tokens=rtstr.split()
                        while tokens and tokens[0] in c:
                            if tokens[0]=='@':
                                mention=' '.join(tokens[1:])
                            if tokens[0]=='retweet':
                                rtfrom=' '.join(tokens[1:])
                            if tokens[0]=='link':
                                link=' '.join(tokens[1:])
                            rtstr=zh2unicode(f.readline().strip())
                            tokens=rtstr.split()
                        if rtid in rtAll:
                            rtrecord=(rtid,ortwid,rtuid,str(rtcreAt),rtTw,mention,rtfrom,link)
                            fdup.write('\t'.join(rtrecord)+'\n')
                        else:
                            i=i+1
                            rtrecord=(i,rtid,ortwid,rtuid,str(rtcreAt),rtTw,mention,rtfrom,link)
                            rwlist.append(rtrecord)
                            rtAll.add(rtid)
                    try:
                        cursor.executemany('INSERT INTO retwtable values(%s,%s,%s,%s,%s,%s,%s,%s,%s)',rwlist)
                        conn.commit()
                    except IntegrityError:
                        print("Duplicate entry for key retwtable_sid")
                    m=m+1
                    print(str(m)+':retweet of '+str(ortwid)+' has finished: '+str(rt))
                except Exception:
                    # Any parse failure (including the empty header line at
                    # the end of the file) ends the run.
                    break
        finally:
            cursor.close()
            conn.close()
    print('retweet over!')
Пример #24
0
import os, sys
sys.path.insert(0, os.path.join("..", ".."))

from pattern.db import date, time, NOW
from pattern.web import Bing, NEWS

# It is often useful to keep a date stamp for each row in the table.
# The pattern.db module's date() function can be used for this.
# It is a simple wrapper around Python's datetime.datetime class,
# with extra functionality to make it easy to parse or print it as a string.

# Parenthesized single-argument print gives the same output on Python 2 and
# also runs on Python 3; print("") replaces the bare print for an empty line.
print(date(NOW))
print(date())
print(date("2010-11-01 16:30", "%Y-%m-%d %H:%M"))
print(date("Nov 1, 2010", "%b %d, %Y"))
print(date("Nov 1, 2010", "%b %d, %Y", format="%d/%m/%Y"))
print("")

# All possible formatting options:
# http://docs.python.org/library/time.html#time.strftime

for r in Bing(license=None, language="en").search("today", type=NEWS):
    print(r.title)
    print(repr(r.date))  # Result.date is a string (e.g. we can't > <= += with the date).
    print(date(r.date))  # date() can parse any Result.date in the web module.
    print("")

# A time() offset can be added to a date in place.
d = date("4 november 2011")
d += time(days=2, hours=5)
Пример #25
0
def _int_or_zero(text):
    """Return int(text.strip()), or 0 when the text is not a valid integer."""
    try:
        return int(text.strip())
    except ValueError:
        return 0


def user2dict(filename):
    """Yield one user record (a list of 14 values) per 15-line group in `filename`.

    The file is read as UTF-8. Its first 14 lines are printed as a header,
    and records are read in groups of 15 lines starting at line index 15.
    Each yielded list holds, in order: user id, name, province, city,
    location, description, gender, followers count, friends count, statuses
    count, creation date, verified, verified type, bi-followers count.

    The bi-followers, followers and statuses counts default to 0 when they
    are not valid integers. Any other problem in a group (for example too
    few lines, or an invalid friends count) prints "<filename> error!" and
    exits the process, as before.
    """
    with codecs.open(filename,'rb',encoding='UTF-8') as f:
        filelines=f.readlines()
    print(''.join(filelines[:14]))
    i=15
    while i<len(filelines):
        # Slicing past the end of the list simply yields a shorter group.
        sublines=filelines[i:i+15]
        try:
            userid=sublines[0].strip()
            biFollowersCount=_int_or_zero(sublines[1])
            city=sublines[2].strip()
            verified=sublines[3].strip()
            followersCount=_int_or_zero(sublines[4])
            location=sublines[5].strip()
            province=sublines[6].strip()
            friendsCount=int(sublines[7].strip())
            # Backslashes are doubled in the free-text fields.
            name=sublines[8].strip().replace("\\","\\\\")
            gender=sublines[9].strip()
            # The last '-' separates the date part from the time part.
            timelist=sublines[10].strip().split('-')
            timestr='-'.join(timelist[:-1])+' '+timelist[-1]
            createdAt=date(timestr)
            verifiedType=sublines[11].strip()
            statusesCount=_int_or_zero(sublines[12])
            description=sublines[13].strip().replace("\\","\\\\")
        except Exception:
            print(filename +" error!")
            exit(0)
        i=i+15
        # The yield is kept outside the try block: closing the generator
        # raises GeneratorExit at the yield, which must not be reported as
        # a file error.
        yield [userid, name, province, city, location, description, gender,
               followersCount, friendsCount, statusesCount, createdAt,
               verified, verifiedType, biFollowersCount]
Пример #26
0
# Python Data Science and Analytics.
# Data Science is a field in computer science that is dedicated to analyzing patterns in raw data using
# techniques like Artificial Intelligence (AI), Machine Learning (ML), mathematical functions, and
# statistical algorithms.
# Pattern is a web mining module for the Python programming language.
# It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser), natural
# language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet), machine learning
# (vector space model, clustering, SVM), network analysis and <canvas> visualization.
# Twitter Opinion Mining, results per day.
# To do this, we need to "bin" the tweets of a politician per day (or per week, month, year) and calculate the
# average sentiment of that day:

from pattern.db import Datasheet, date, avg
from collections import defaultdict

# politician => (year, month, day) => list of sentiment scores for that day.
bins = defaultdict(lambda: defaultdict(list))

# The date column is unpacked as `day`, so that it does not overwrite the
# date() function imported from pattern.db, which is needed to parse it.
# NOTE(review): this assumes every row of data.csv has exactly the four
# columns unpacked here (politician, party, date, score); confirm against
# the actual file.
for politician, party, day, score in Datasheet.load("data.csv"):

    d = date(day)
    d = (d.year, d.month, d.day)

    bins[politician][d].append(float(score))

for politician in bins:

    # Replace each day's list of scores with its average. Only existing keys
    # are reassigned, so the dictionary does not change size while iterating.
    for day in bins[politician]:
        bins[politician][day] = avg(bins[politician][day])