예제 #1
0
def run_from_sqlite(start=0,antall=5):
    """
    Scrape links from the local sqlite3 db that still have status NULL.

    Fetches up to `antall` unscraped links (newest first), scrapes each one
    unless its url is already present in the MySQL `page` table, and writes
    the outcome back into the sqlite3 `links` table.

    start  -- offset into the result set; should normally stay 0, because
              rows that receive a status drop out of the NULL selection.
    antall -- number of links to process in this run.
    """
    try:
        lite_con = lite.connect('nrk2013_2.db')
        lite_cur = lite_con.cursor()
        # fetch links that do not have a status yet.
        # FIX: parameterized query instead of %-formatting values into SQL
        lite_cur.execute('SELECT * FROM links WHERE status is NULL '
                         'ORDER BY date DESC LIMIT ?,?', (start, antall))
        rows = lite_cur.fetchall()

        # check whether each url was already scraped into the MySQL db
        connection, cur = connect()

        # loop through SQLite set
        for row in rows:
            # row[0] is the page, row[1] is the url, row[2] the collection time.
            # Only the primary url is checked (not the self-link).
            # FIX: parameterized to avoid SQL injection via the url value
            cur.execute("SELECT * FROM page WHERE url = %s", (row[1],))
            mysql_rows = cur.fetchall()
            if len(mysql_rows) == 0:  # not in MySQL yet -> scrape and insert
                logger.info( "finnes, ikke, må settes inn: %s" % (row[1].encode('utf8')) )

                sleep(scraping_request_stagger) # found in settings
                status = create_dictionary(row[1])
                # rdbms_insertion.py commits as well; this extra commit is
                # belt-and-braces
                connection.commit()

                # write the scrape status back to sqlite3
                logger.info( "status: %s" %  status )

                lite_cur.execute("UPDATE links SET status=? WHERE page=? AND link=?", (status, row[0], row[1]))
                lite_con.commit()
            else:
                # already exists in MySQL but has status NULL in the .db:
                # just mark the sqlite3 row as scraped
                lite_cur.execute("UPDATE links SET status=? WHERE page=? AND link=?", ("scraped", row[0], row[1]))
                lite_con.commit()
                logger.info( "fantes allarede i mysql, oppdaterer sqlite3: %s" % (row[1]) )

        lite_con.close()    # close sqlite3
        connection.close()  # close mysql

    except lite.Error, e:
        print "Error %s:" % e.args[0]
        sys.exit(1)
예제 #2
0
파일: __init__.py 프로젝트: DSS1314/AWS2
 def ui():  # handler for the send button
     """Read the url typed into the input box, hand it to the backend, and print the reply."""
     typed_url = inputUrl.get()
     reply = connect_mysql.connect(typed_url)  # the backend does the actual processing
     print(reply)
예제 #3
0
import pandas as pd
import numpy as np
from sqlalchemy.orm import sessionmaker
from connect_mysql import connect

# create a configured "Session" class bound to the engine returned by
# connect_mysql.connect(); call Session() to open a new ORM session
Session = sessionmaker(bind=connect())


def get_unique_df_values(dataframe, col_name):
    """Return the distinct values of column `col_name` of `dataframe`,
    in order of first appearance, as a plain Python list."""
    return dataframe[col_name].unique().tolist()


def read_csv(file_name):
    """
    Read the product csv file and map the nutrition grade onto an integer.

    file_name -- path or file-like object accepted by pandas.read_csv.

    Returns the DataFrame with both "missing" spellings ('-' and NaN)
    normalised to None, and 'nutrition_grade_fr' mapped a..e -> 1..5
    (missing -> 6).
    """
    df_original = pd.read_csv(file_name)
    # normalise the two "missing" spellings to a single None marker
    df_original = df_original.replace({"-": None})
    df_original = df_original.replace({np.nan: None})
    # Here we map the nutrition grade into integers; None (missing) becomes 6
    grade = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, None: 6}
    df_original['nutrition_grade_fr'] = [
        grade[item] for item in df_original['nutrition_grade_fr']
    ]
    # BUG FIX: the mapped frame was built but never handed back to the caller
    return df_original
예제 #4
0
def _insert_authors(cur, insertion_author, article):
    # One row per author.  NRK does not always remember its authors (see e.g.
    # http://www.nrk.no/livsstil/test-av-norges-mest-solgte-brod-1.8352163,
    # which has three), so empty fields are normalised to None and rows where
    # every field is missing are skipped entirely.
    for author in article['authors']:
        author_name = author[0] or None
        author_mail = author[1] or None
        author_role = author[2] or None
        # insert only when at least one of the three fields is present
        if not (author_name is None and author_mail is None and author_role is None):
            cur.execute(insertion_author,
                        (article['url'], # article['url_self_link']
                         author_name,
                         author_mail,
                         author_role))


def _insert_factboxes(cur, insertion_factbox, article):
    # One row per factbox: link count, word count and the text itself.
    for box in article['factbox']:
        cur.execute(insertion_factbox,
                    (article['url'], # article['url_self_link']
                     len(box['links']),
                     box['wordcount'],
                     box['text'].encode('utf-8')))


def _insert_links(cur, insertion_link, article):
    # One row per outgoing link; internal links are flagged '1', external '0'.
    for links, internal_flag in ((article['internal_links'], '1'),
                                 (article['external_links'], '0')):
        for link in links:
            extr = tldextract.extract(link)  # (subdomain, domain, suffix)
            cur.execute(insertion_link.encode('utf-8'),
                        (article['url'].encode('utf-8'),
                         link.encode('utf-8'),
                         u"html",
                         extr[0].encode('utf-8'),
                         extr[1].encode('utf-8'),
                         extr[2].encode('utf-8'),
                         internal_flag.encode('utf-8')))


def add_to_db(dict):
    """
    Insert one scraped article into the MySQL database.

    dict -- the article dictionary built by the scraper (url, headline, body,
            authors, factboxes, links, share counts, ...).  NOTE: the
            parameter shadows the builtin `dict`; the name is kept so
            existing callers are unaffected.

    The author, factbox and link child rows are inserted first, then the
    article row itself; one commit at the end covers everything.
    """
    # the prepared INSERT statements live in separate .sql files
    with open('insertion.sql', 'r') as f:
        insertion = f.read()
    with open('insertion_link.sql', 'r') as f:
        insertion_link = f.read()
    with open('insertion_author.sql', 'r') as f:
        insertion_author = f.read()
    with open('insertion_factbox.sql', 'r') as f:
        insertion_factbox = f.read()

    # connection info in connect_mysql.py
    connection, cur = connect()

    _insert_authors(cur, insertion_author, dict)
    _insert_factboxes(cur, insertion_factbox, dict)
    _insert_links(cur, insertion_link, dict)

    # "NULL" is the scraper's marker for "no publication time found";
    # otherwise `published` is a time tuple, formatted for MySQL here
    published = dict['published']
    if published != "NULL":
        published = datetime.fromtimestamp(mktime(published)).strftime("%Y-%m-%d %H:%M:%S")
    timestamp = dict['timestamp'].strftime("%Y-%m-%d %H:%M:%S")

    # the main article row; value order must match insertion.sql
    cur.execute(insertion,
                (dict['url'],
                 dict['url_self_link'],
                 dict['headline'],
                 dict['body'],
                 published,
                 dict['updated'],
                 timestamp,
                 dict['fb_like'],
                 dict['fb_share'],
                 dict['googleplus_share'],
                 dict['twitter_share'],
                 dict['others_share'],
                 dict['language'],
                 dict['lesbahet'],
                 dict['news_bureau'],
                 len(dict['external_links']),
                 len(dict['internal_links']),
                 dict['word_count'],
                 dict['line_count'],
                 dict['char_count'],
                 len(dict['factbox']),
                 dict['comment_fields'],
                 dict['comment_number'],
                 dict['interactive_elements'],
                 dict['poll'],
                 dict['game'],
                 dict['video_files'],
                 dict['video_files_nrk'],
                 dict['flash_file'],
                 dict['image_collection'],
                 dict['images'],
                 dict['image_captions'],
                 dict['related_stories'],
                 dict['related_stories_box_thematic'],
                 dict['related_stories_box_les'],
                 dict['map'],  # map IS NOT DONE
                 dict['publiseringssted'],
                 dict['programtilknytning'],
                 dict['hovedkategori'],
                 dict['iframe'],
                 dict['css'],
                 dict['js'],
                 dict['template']))

    connection.commit()
    return
예제 #5
0
        self.url = url
        self.nutrition_grade = nutrition_grade
        self.energy = energy
        self.proteins = proteins


class Favorite(Base):
    """ORM model for the `favorites` table: a product the user saved,
    together with the store where it can be bought."""

    __tablename__ = 'favorites'

    id = Column(Integer, primary_key=True)
    product_name = Column(String(500))
    bar_code = Column(String(1500))
    url = Column(String(2500))
    nutrition_grade = Column(Integer)
    energy = Column(Float)
    proteins = Column(Float)
    store_name = Column(String(200))

    def __init__(self, product_name, bar_code, url, nutrition_grade, energy,
                 proteins, store_name):
        # assign each constructor argument to the attribute of the same name
        for field, value in (('product_name', product_name),
                             ('bar_code', bar_code),
                             ('url', url),
                             ('nutrition_grade', nutrition_grade),
                             ('energy', energy),
                             ('proteins', proteins),
                             ('store_name', store_name)):
            setattr(self, field, value)


# emit CREATE TABLE statements for every model registered on Base
# (including Favorite above) against the engine returned by connect()
Base.metadata.create_all(connect())
예제 #6
0
def add_to_db(dict):
    """
    Insert one scraped article into the MySQL database.

    dict -- article dictionary produced by the scraper (url, headline, body,
            authors, factboxes, links, share counts, ...).  NOTE: the
            parameter shadows the builtin `dict`; the name is kept so that
            existing callers are unaffected.

    Child rows (authors, factboxes, links) are inserted first, then the main
    article row; a single commit at the end covers all of them.
    """
    # the prepared INSERT statements live in separate .sql files
    with open('insertion.sql', 'r') as f:
        insertion = f.read()
    with open('insertion_link.sql', 'r') as f:
        insertion_link = f.read()
    with open('insertion_author.sql', 'r') as f:
        insertion_author = f.read()
    with open('insertion_factbox.sql', 'r') as f:
        insertion_factbox = f.read()

    # connection info in connect_mysql.py
    connection, cur = connect()

    # We must be careful with authors, because NRK does not always manage to
    # remember them (see e.g.
    # http://www.nrk.no/livsstil/test-av-norges-mest-solgte-brod-1.8352163,
    # which has three authors).  Empty fields are normalised to None, and a
    # row is only inserted when at least one field is present.
    for author in dict['authors']:
        author_name = author[0] or None
        author_mail = author[1] or None
        author_role = author[2] or None
        if not (author_name is None and author_mail is None
                and author_role is None):
            cur.execute(
                insertion_author,
                (
                    dict['url'],  # dict['url_self_link']
                    author_name,
                    author_mail,
                    author_role))

    # one row per factbox: link count, word count and the text itself
    for box in dict['factbox']:
        cur.execute(
            insertion_factbox,
            (
                dict['url'],  # dict['url_self_link']
                len(box['links']),
                box['wordcount'],
                box['text'].encode('utf-8')))

    # internal links are flagged '1'
    for link in dict['internal_links']:
        extr = tldextract.extract(link)  # (subdomain, domain, suffix)
        cur.execute(
            insertion_link.encode('utf-8'),
            (
                dict['url'].encode('utf-8'),
                link.encode('utf-8'),
                u"html",
                extr[0].encode('utf-8'),
                extr[1].encode('utf-8'),
                extr[2].encode('utf-8'),
                '1'.encode('utf-8')))

    # external links are flagged '0'
    for link in dict['external_links']:
        extr = tldextract.extract(link)
        cur.execute(
            insertion_link.encode('utf-8'),
            (
                dict['url'].encode('utf-8'),
                link.encode('utf-8'),
                u"html",
                extr[0].encode('utf-8'),
                extr[1].encode('utf-8'),
                extr[2].encode('utf-8'),
                '0'.encode('utf-8')))

    # "NULL" is the scraper's marker for "no publication time found";
    # otherwise `published` is a time tuple, formatted for MySQL here
    published = dict['published']
    if published != "NULL":
        published = datetime.fromtimestamp(
            mktime(published)).strftime("%Y-%m-%d %H:%M:%S")
    timestamp = dict['timestamp'].strftime("%Y-%m-%d %H:%M:%S")

    # the main article row; value order must match insertion.sql
    cur.execute(
        insertion,
        (
            dict['url'],
            dict['url_self_link'],
            dict['headline'],
            dict['body'],
            published,
            dict['updated'],
            timestamp,
            dict['fb_like'],
            dict['fb_share'],
            dict['googleplus_share'],
            dict['twitter_share'],
            dict['others_share'],
            dict['language'],
            dict['lesbahet'],
            dict['news_bureau'],
            len(dict['external_links']),
            len(dict['internal_links']),
            dict['word_count'],
            dict['line_count'],
            dict['char_count'],
            len(dict['factbox']),
            dict['comment_fields'],
            dict['comment_number'],
            dict['interactive_elements'],
            dict['poll'],
            dict['game'],
            dict['video_files'],
            dict['video_files_nrk'],
            dict['flash_file'],
            dict['image_collection'],
            dict['images'],
            dict['image_captions'],
            dict['related_stories'],
            dict['related_stories_box_thematic'],
            dict['related_stories_box_les'],
            dict['map'],  # map IS NOT DONE
            dict['publiseringssted'],
            dict['programtilknytning'],
            dict['hovedkategori'],
            dict['iframe'],
            dict['css'],
            dict['js'],
            dict['template']))

    connection.commit()
    return