def _extract_data2(self, doc_str):
        doc = Pq(doc_str)
        a_list = doc(".place>ul>li>a")
        try:
            self._comcode_detail["province"] = doc(a_list[1]).text()
            self._comcode_detail["city"] = doc(a_list[2]).text()
        except IndexError as er:
            sql = "   UPDATE  fetch_list2 SET  times = 0  WHERE url = '{}'".format(
                self._now_url)
            Dao.execute_dmls(sql)
        self._comcode_detail["area"] = doc('.content>ul>li>h1').text()
        tr_list = doc('.content>table>tr')
        for tr in tr_list:
            try:
                # time.sleep(1)
                td_list = doc(tr).find("td")
                self._comcode_detail["street"] = doc(
                    td_list[0]).find("a").text()
                a_list = doc(td_list[1]).find("a")
                for a in a_list:
                    self._comcode_detail["society_community"] = doc(a).text()
                    self._save_comcode()

            except IndexError as er:
                print("error in " + doc(tr).text())
def save_apartments(COMMUNITY_ID, BUILDING_NUM, URL):
    # URL = 'http://www.szfcweb.com/szfcweb/(S(knmrwg452ea0mu55p2f5zi45))/DataSerach/SaleInfoHouseShow.aspx?PBTAB_ID=YFW003120_MD003&SPJ_ID=a5121bf5-f3af-451d-9e6c-01b1e33b2f7b'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
        'Referer':
        'http://www.szfcweb.com/szfcweb/(S(knmrwg452ea0mu55p2f5zi45))/DataSerach/SaleInfoProListIndex.aspx'
    }
    dao = Dao()
    request = urllib.request.Request(url=URL)
    m_fp = urllib.request.urlopen(request, timeout=500)
    html_str = m_fp.read().decode("utf8")
    doc = Pq(html_str)
    try:
        table = doc("table.table_xkb")
        td_list = doc(table).find("div.lfzt>a")
        for td in td_list:
            APARTMENT_NUM = doc(td).text()
            insertSQL = "INSERT INTO  apartments (COMMUNITY_ID , BUILDING_NUM , APARTMENT_NUM ,STATUS ,create_time  )" \
                        " VALUES ('{}','{}','{}','{}','{}'  )".format(COMMUNITY_ID, BUILDING_NUM, APARTMENT_NUM, 2,
                                                                      time.strftime('%Y-%m-%d %H:%M:%S',
                                                                                    time.localtime(time.time())))

            dao.execute_dmls(insertSQL)
    except Exception as e:
        print(e)
    update_sql = "update ehdc.buildings set status=2 where url = '{}' ;".format(
        URL)
    dao.execute_dmls(update_sql)
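
# A minimal parameterized-query sketch of the same insert, assuming the
# underlying connection is a DB-API driver such as pymysql; the Dao class in
# these snippets only exposes execute_dmls(raw_sql), so the helper below and
# its connection argument are illustrative, not the original API.
def save_apartment_row(conn, community_id, building_num, apartment_num):
    sql = ("INSERT INTO apartments "
           "(COMMUNITY_ID, BUILDING_NUM, APARTMENT_NUM, STATUS, create_time) "
           "VALUES (%s, %s, %s, %s, NOW())")
    with conn.cursor() as cursor:
        # placeholders let the driver handle quoting and escaping
        cursor.execute(sql, (community_id, building_num, apartment_num, 2))
    conn.commit()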
Example #3
    def _generate_seed_url(self):
        """
        generate all url to visit
        """
        # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
        # self._visit_pages(self._seed_url)
        # from page 1 to anypage which < 200

        # # add seed urls from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        querysql = "SELECT COMMUNITY_ID,ORIGINAL_URL FROM communities WHERE   source_id ='{}' and status<2 ; ".format(
            self.source_id)
        result = Dao.execute_query(querysql)
        for COMMUNITY_ID, ORIGINAL_URL in result:
            try:
                self._apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
                self._apartment_detail["create_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # print("_generate_seed_url func : "+ORIGINAL_URL)
                self._visit_pages(ORIGINAL_URL)
                sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)
            except Exception as e:
                print(e)
                sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)
 def _save_comcode(self):
     insert_sql = "INSERT INTO comcode (province ,city,area,street,society_community  )" \
                 " VALUES ('{}','{}','{}','{}','{}'  )".format(self._comcode_detail["province"],
                                                               self._comcode_detail["city"],
                                                               self._comcode_detail["area"],
                                                               self._comcode_detail["street"],
                                                               self._comcode_detail["society_community"])
     Dao.execute_dmls(insert_sql)
 def _save_community(self):
     # check whether the record already exists in the table
     query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id = 18 ".format(self._community_detail["url"])
     if Dao.execute_query(query_sql) is not None:
         print(" {} is already exists ,so next".format(self._community_detail["name"]))
         return
     # insert the record
     Dao.execute_dmls(self._insert_community())
 def _generate_seed_url(self):
     """
     generate all url to visit
     """
     # add seed urls from the database
     # from page 1 to anypage which < 200
     global Dao
     Dao = Dao()
     self._seed_url = Dao._get_url_by_id(self.source_id)
def get_communities():
    dao = Dao()
    sql = '''SELECT DISTINCT COMMUNITY_ID, BUILDING_NUM, URL
             FROM ehdc.buildings WHERE STATUS = 0 LIMIT 1,100; '''
    result = dao.execute_query(sql)
    return result
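
# Usage sketch: feed every pending building returned by get_communities into
# save_apartments; the wrapper name below is illustrative, and it assumes both
# functions live in the same module and that execute_query returns None when
# there are no rows, as elsewhere in these snippets.
def crawl_pending_buildings():
    for COMMUNITY_ID, BUILDING_NUM, URL in (get_communities() or []):
        save_apartments(COMMUNITY_ID, BUILDING_NUM, URL)
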
def getmerchants(lat, long, COMMUNITY_ID):
    dao = Dao()
    param = {
        'long': long,
        'lat': lat,
        'cat': '',
        'page': '1',
        'order': '1',
        'ondoor': '0',
        'type': 'nine'
    }
    cats = ['街道办', "居委会", '入学']
    for cat in cats:
        param['cat'] = cat
        url = "http://llzg.com/llzgmri/m/p/business/list?" + urllib.parse.urlencode(
            param)

        r = urllib.request.urlopen(url)
        rlt = json.loads(r.read().decode('UTF-8'))
        try:
            for merchant in rlt['business']:
                insertSql = '''INSERT INTO ehdc.merchant_llzg
                            (city_id,
                             NAME,
                             phone,
                             area_name,
                             location,
                             description,
                             url,
                             LONGITUDE,
                             LATITUDE,
                             source_id,
                             service,
                             display_name,logo,COMMUNITY_ID)
                VALUES ('',
                        '{}',
                        '{}',
                        '',
                        '{}',
                        '',
                        '{}',
                        '{}',
                        '{}',
                        '',
                        '{}',
                        '{}',
                        '{}',
                        '{}');'''.format(merchant['business_name'],
                                         merchant['phone_number'],
                                         merchant['address'], '',
                                         merchant['lat'], merchant['long'],
                                         merchant['sub_title'], param['cat'],
                                         merchant['logo'], COMMUNITY_ID)
                dao.execute_dmls(insertSql)
        except Exception:
            # the response did not contain the expected 'business' list; skip it
            pass
Example #9
 def _save_merchant(self):
     # check whether the record already exists in the table
     query_sql = "SELECT * FROM merchant WHERE url = '{}'".format(
         self._merchant_detail["url"])
     if Dao.execute_query(query_sql) is not None:
         print(" {} is already exists ,so next".format(
             self._merchant_detail["name"]))
         return
     # insert the record
     Dao.execute_dmls(self._insert_merchant())
Example #10
def convert_to_db(filename, db_filename):
    click.echo("Converting \"{}\" into a database \"{}\"".format(
        filename, db_filename))

    worksheet = get_spreadsheet(filename)

    click.echo("Creating database file...")
    new_dao = Dao(db_filename)
    conn = new_dao.create_connection()

    if conn is not None:
        table_name = input("Give name for the database table: ")
        click.echo()

        columns = get_headers(worksheet)
        values = get_values(worksheet)

        new_dao.create_table(table_name)

        for column in columns:
            new_dao.create_column(table_name, column)

        column_titles = ', '.join(columns)

        new_dao.insert_values(table_name, column_titles, values)

    if conn is not None:
        conn.close()

    click.echo("Database '{}' created.".format(db_filename))
Example #11
    def mining(self, stamp_start, stamp_finish):
        stamp_start, stamp_finish = int(stamp_start) + 1, int(stamp_finish)
        cont = 5
        pivo = 5
        options = webdriver.ChromeOptions()
        options.add_argument('headless')

        self._driver = webdriver.Chrome(options=options)
        while stamp_start < stamp_finish:

            url = "https://twitter.com/search?f=tweets&vertical=default&q=%23{}%20since%3A{}%20until%3A{}&l=pt&src=typd".format(
                self._hastag, stamp_start, stamp_start + self.STEPP_TIMESTAMP)
            driver = self._driver
            driver.get(url)
            assert "since" in driver.title
            elementList = driver.find_elements_by_class_name("js-stream-tweet")
            lastSizeList = len(elementList)

            while True:
                body = driver.find_element_by_tag_name('body')
                body.send_keys(Keys.END)
                time.sleep(2)
                elementList = driver.find_elements_by_class_name(
                    "js-stream-tweet")
                sizeList = len(elementList)

                if lastSizeList == sizeList:
                    break
                elif sizeList > 100:
                    break
                lastSizeList = len(elementList)

            stringList = []

            for tweet in reversed(elementList):
                idTweet = tweet.get_attribute("data-tweet-id")
                tweetTimeStamp = tweet.find_elements_by_class_name(
                    "js-short-timestamp")[0].get_attribute("data-time")
                stringList.append(idTweet + ' ' + tweetTimeStamp + '\n')
                dao = Dao()
                dao.insert(
                    'manager',
                    ['hastag', 'idTweet', 'idCandidato', 'timeStamp'],
                    [self._hastag, idTweet, self._candidato, tweetTimeStamp])

            print('...')
            cont += 1
            if cont > pivo:
                pivo += 5
                self.get_status(stamp_start)

            stamp_start = stamp_start + self.STEPP_TIMESTAMP + 1

        self._driver.close()
        self._driver.quit()
 def _save_community(self):
     # check whether the record already exists in the table
     query_sql = "SELECT * FROM ehdc.communities_llzg WHERE NAME = '{}' and AREA_NAME='{}' ".format(
         self._community_detail["name"],
         self._community_detail["area_name"])
     if Dao.execute_query(query_sql) is not None:
         print(" {} is already exists ,so next".format(
             self._community_detail["name"]))
         return
     # insert the record
     Dao.execute_dmls(self._insert_community())
Example #13
    def _visit_pages(self, seed_url):
        """
        visit one url,get page content
        """

        for single_url in seed_url:
            update_sql = "   UPDATE  fetch_list SET  times = times+1 WHERE url = '{}'and source_id = 16".format(
                single_url[0])
            Dao.execute_dmls(update_sql)
            self._now_url = single_url[0]
            html = self.get_page_content_str(single_url[0])
            self._extract_data(html)
 def __init__(self,
              LT1,
              LG1,
              LT2,
              LG2,
              cityname,
              cityid,
              cityenname,
              name,
              shopId=0,
              categoryId=0):
     threading.Thread.__init__(self, name=name)
     self.cityid = cityid
     self.shopId = shopId
     self.categoryId = categoryId
     self.Lat1 = LT1
     self.Lat2 = LT2
     self.Long1 = LG1
     self.Long2 = LG2
     self.city_name = cityname
     self.values = {
         'promoId': '0',
         'shopType': '',
         'categoryId': '',
         'sortMode': '2',
         'shopSortItem': '1',
         'keyword': '',
         'searchType': '1',
         'branchGroupId': '0',
         'shippingTypeFilterValue': '0',
         'page': '1'
     }
     self.values["cityId"] = cityid
     self.values["cityEnName"] = cityenname
     self.url = "http://www.dianping.com/search/map/ajax/json"
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
         'Referer':
         'http://www.dianping.com/search/map/category/{}/0'.format(cityid)
     }
     self.dao = Dao()
     self.query_sql = "SELECT shopType ,categoryId,NAME FROM category  WHERE categoryId <> shopType AND categoryId <>'None'  "
     self.result = self.dao.execute_query(self.query_sql)
     self.query_sql2 = "SELECT shopId FROM shop_bean where  city_name ='{}'".format(
         cityname)
     self.result2 = self.dao.execute_query(self.query_sql2)
     self.shopIds = []
     if self.result2 is not None:
         for shopid in self.result2:
             self.shopIds.append(shopid[0])
    def get_page_content_str(self, url):
        time.sleep(1)

        try:
            print("现在开始抓取" + url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            request = urllib.request.Request(url=url, headers=headers)
            m_fp = urllib.request.urlopen(request, timeout=500)
            html_str = m_fp.read().decode('utf-8')
            m_fp.close()
            return html_str
        except urllib.error.URLError as err:
            # logfile = open('test.log', 'a')
            # logfile.write("Error: {} \n in  url : {}".format(err, url))
            # logfile.close()
            # print("error in {}.get_page_content_str".format(__name__))
            sql = "   UPDATE  fetch_list SET  times = 0  WHERE url = '{}'".format(
                self._now_url)
            dao = Dao()
            dao.execute_dmls(sql)
            # if url[-3:] == "htm":
            # time.sleep(120)
            #     return self.get_page_content_str(url)
            return None
        except Exception as err:
            print(err)
            sql = "   UPDATE  fetch_list SET  times = 0  WHERE url = '{}'".format(
                self._now_url)
            dao = Dao()
            dao.execute_dmls(sql)
            return None
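
# A minimal retry sketch with exponential backoff around urlopen, as an
# alternative to resetting fetch_list.times on every failure; max_tries and
# the timeout value are arbitrary choices, not part of the original crawler.
import time
import urllib.error
import urllib.request


def fetch_with_retry(url, headers, max_tries=3):
    for attempt in range(max_tries):
        try:
            request = urllib.request.Request(url=url, headers=headers)
            with urllib.request.urlopen(request, timeout=60) as resp:
                return resp.read().decode('utf-8')
        except urllib.error.URLError:
            time.sleep(2 ** attempt)  # back off before the next attempt
    return None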
Example #16
    def start(self):

        if self._print_status:
            print('start mining #{} ...'.format(self._hastag))
        dao = Dao()
        timeStamp_tweet_list = dao.select(
            '*', "manager",
            "hastag = '{}' ORDER BY 'timeStamp'".format(self._hastag))
        lastTweetTimeStamp = int(timeStamp_tweet_list[-1]['timeStamp'])

        if lastTweetTimeStamp > self.START_TIMESTAMP:
            self.mining(lastTweetTimeStamp, self.FINISH_TIMESTAMP)

        else:
            self.mining(self.START_TIMESTAMP, self.FINISH_TIMESTAMP)
 def _save_apartment(self, apartment_detail):
     # check whether the record already exists in the table
     query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID  = {}  and FLOOR_NUM ='{}'  and APARTMENT_NUM ='{}' ".format(
         int(apartment_detail["COMMUNITY_ID"]), apartment_detail["FLOOR_NUM"],
         apartment_detail["APARTMENT_NUM"])
     if Dao.execute_query(query_sql) is not None:
         print(" {} is already exists ,so next".format(str(apartment_detail["COMMUNITY_ID"]) +
                                                       apartment_detail["FLOOR_NUM"] +
                                                       apartment_detail["APARTMENT_NUM"]))
         return
     # insert the record
     try:
         Dao.execute_dmls(self._insert_apartment(apartment_detail))
     except Exception as e:
         print(e)
 def _check_community(self, url):
     # check whether a completed record already exists in the table
     communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} and status = 2 ".format(
         url, self.source_id)
     result = Dao.execute_query(communityid_sql)
     if result is None:
         return False
     return True
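
# A parameterized variant of the existence check above, assuming a DB-API
# cursor (e.g. from pymysql); the Dao helpers in these snippets take raw SQL
# strings, so this function and its cursor argument are illustrative only.
def community_is_done(cursor, url, source_id):
    cursor.execute(
        "SELECT COMMUNITY_ID FROM communities "
        "WHERE ORIGINAL_URL = %s AND source_id = %s AND status = 2",
        (url, source_id))
    return cursor.fetchone() is not None
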
    def _visit_pages(self, seed_url):
        """
        visit one url,get page content
        """

        for single_url in seed_url:
            update_sql = "   UPDATE  fetch_list SET  times = times+1 WHERE url = '{}'and source_id =17".format(
                single_url[0])
            Dao.execute_dmls(update_sql)
            self._base_url = single_url[0]
            self._now_url = single_url[0]
            html = self.get_page_content_str(single_url[0])
            try:
                self._extract_data(html)
            except Exception as e:
                print(e)
                update_sql = "   UPDATE  fetch_list SET  status  = 1 WHERE url = '{}'and source_id =17".format(
                    single_url[0])
                Dao.execute_dmls(update_sql)
 def execute(self, COMMUNITY_ID, BUILDING_NUM, URL):
     try:
         apartment_detail = {
             'COMMUNITY_ID': 0,
             'BUILDING_NUM': '',
             'APARTMENT_NUM': '',
             'STATUS': '2',
             'create_time': ''
         }
         apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
         apartment_detail["BUILDING_NUM"] = BUILDING_NUM
         apartment_detail["create_time"] = time.strftime(
             '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
         self._visit_pages(URL, apartment_detail)
         sql = "update BUILDINGS set status = '2' where URL = '{}' ; ".format(
             URL)
         Dao.execute_dmls(sql)
     except Exception as e:
         print(e)
         sql = "update BUILDINGS set status = -1 where URL = '{}' ; ".format(
             URL)
         Dao.execute_dmls(sql)
    def _visit_pages(self, seed_url):
        """
        visit one url,get page content
        """

        for single_url in seed_url:
            # # fetch the html source
            # html = self.get_page_content_str(single_url)
            #
            # # choose which extraction method to use
            # self._extract_data(html)

            # dao=Dao()
            # insert_sql =" INSERT INTO fetch_list (source_id, url,times,page,STATUS) VALUE(99,'{}',0,0,0)".format(single_url)
            # dao.execute_dmls(insert_sql)

            dao = Dao()
            update_sql = "   UPDATE  fetch_list2 SET  times = times+1 WHERE url = '{}'and source_id = 98 ".format(
                single_url[0])
            dao.execute_dmls(update_sql)
            self._now_url = single_url[0]
            html = self.get_page_content_str(single_url[0])
            self._extract_data2(html)
    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        self._comcode_detail["province"] = doc('.content>ul>li>h1').text()
        doc = Pq(doc_str)
        tr_list = doc('.content>table>tr')

        for tr in tr_list:
            try:
                # time.sleep(1)
                td_list = doc(tr).find("td")
                self._comcode_detail["city"] = doc(td_list[0]).find("a").text()
                a_list = doc(td_list[1]).find("a")
                for a in a_list:
                    self._comcode_detail["area"] = doc(a).text()
                    url = self._base_url + doc(a).attr("href")
                    # html = self.get_page_content_str(url)
                    # self._extract_data2(html)
                    insert_sql = " INSERT INTO fetch_list2 (source_id, url,times,page,STATUS) VALUE(98,'{}',0,0,0)".format(
                        url)
                    print("insert sql is [" + insert_sql)
                    Dao.execute_dmls(insert_sql)
            except IndexError as er:
                print("error in " + doc(tr).text())
    def _generate_seed_url(self):
        """
        generate all url to visit
        """
        # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
        # self._visit_pages(self._seed_url)
        # from page 1 to anypage which < 200

        # # add seed urls from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        querysql = "SELECT  COMMUNITY_ID, BUILDING_NUM, URL  FROM ehdc.buildings WHERE STATUS = 0  ; "
        result = Dao.execute_query(querysql)
        for COMMUNITY_ID, BUILDING_NUM, URL in result:
            self.execute(COMMUNITY_ID, BUILDING_NUM, URL)
 def _extract_data(self, url):
     community_id = self._save_community()
     doc_str = self.get_page_content_str(url)
     doc = Pq(doc_str)
     tr_list = doc("table>tr")
     try:
         for tr in tr_list:
             Floor_num = Pq(tr)("td:eq(0)").text()
             a_list = doc(tr).find("td.preview>a")
             for a in a_list:
                 apartment_detail = {
                     'COMMUNITY_ID': community_id,
                     'FLOOR_NUM': Floor_num,
                     'APARTMENT_NUM': doc(a).text(),
                     'STATUS': '2',
                     'create_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                 }
                 self._save_apartment(apartment_detail)
         sql = "update communities set status = '2' where ORIGINAL_URL = '{}' ; ".format(url)
         Dao.execute_dmls(sql)
     except Exception as e:
         print(e)
         sql = "update communities set status = -1 where ORIGINAL_URL = '{}' ; ".format(url)
         Dao.execute_dmls(sql)
Example #25
 def __init__(self):
     self._conn = sqlite3.connect('database.db')
     self.vaccines = Dao(DTO.Vaccine, self._conn)
     self.suppliers = Dao(DTO.Supplier, self._conn)
     self.clinics = Dao(DTO.Clinic, self._conn)
     self.logistics = Dao(DTO.Logistic, self._conn)
Example #26
class CommunitiesListCrawler(BaseCrawler, threading.Thread):
    global Dao
    Dao = Dao()

    def __init__(self):

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 31
        self._base_url = "http://www.tywsfdc.com/"
        self._root_url = "http://www.tywsfdc.com/Firsthand/tyfc/publish/p/ProNBList.do?pid"
        self._apartment_detail = {
            'COMMUNITY_ID': 0,
            'BUILDING_NUM': '',
            'APARTMENT_NUM': '',
            'STATUS': '2',
            'create_time': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one url,get page content
        """

        # single url
        # html = self.get_page_content_str(self._seed_url[0])  # when reading from the database
        self._pid = seed_url[seed_url.rindex("-"):]
        seed_url = self._root_url + "=" + self._pid
        # print("_visit_pages " + seed_url)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
            'Referer': seed_url
        }
        values = {'pid': self._pid, 'pageNo': '1', 'pageSize': '50'}
        data = urllib.parse.urlencode(values).encode(encoding='UTF8')
        request = urllib.request.Request(
            url="http://www.tywsfdc.com/Firsthand/tyfc/publish/ProNBList.do",
            headers=headers,
            data=data)

        m_fp = urllib.request.urlopen(request, timeout=500)
        html_str = m_fp.read().decode("utf8")
        self.findEachBuilding(html_str)
        # b = set(self._resualt)
        # self._resualt=[i for  i in b]
        # # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        # print( insert_sql  )
        # dao = Dao()
        # dao.execute_dmls(insert_sql)

    def get_page_content_str(self, url):

        try:
            print("现在开始抓取" + url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            request = urllib.request.Request(url=url, headers=headers)
            m_fp = urllib.request.urlopen(request, timeout=1500)
            html_str_uncode = m_fp.read()
            if not html_str_uncode:
                print("Something went wrong, no data returned; retrying")
                return self.get_page_content_str(url)
            m_fp.close()
            return html_str_uncode
        except urllib.error.URLError as err:
            return None
        except Exception as err:
            print(err)
            return None

    def _generate_seed_url(self):
        """
        generate all url to visit
        """
        # self._seed_url = "http://www.njhouse.com.cn/spf/inf/index.php?prjid=108510"
        # self._visit_pages(self._seed_url)
        # from page 1 to anypage which < 200

        # # add seed urls from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        querysql = "SELECT COMMUNITY_ID,ORIGINAL_URL FROM communities WHERE   source_id ='{}' and status<2 ; ".format(
            self.source_id)
        result = Dao.execute_query(querysql)
        for COMMUNITY_ID, ORIGINAL_URL in result:
            try:
                self._apartment_detail["COMMUNITY_ID"] = int(COMMUNITY_ID)
                self._apartment_detail["create_time"] = time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # print("_generate_seed_url func : "+ORIGINAL_URL)
                self._visit_pages(ORIGINAL_URL)
                sql = "update communities set status = '2' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)
            except Exception as e:
                print(e)
                sql = "update communities set status = '-1' where COMMUNITY_ID = '{}' ".format(
                    int(COMMUNITY_ID))
                Dao.execute_dmls(sql)

                # add directly, for testing
                # self._seed_url.append(self._base_url)

    def findEachBuilding(self, html):
        doc = Pq(html)
        tr_list = doc("table>tr")
        # print("tr size ")
        for tr in tr_list:
            try:
                # url of each individual building
                objid = doc(tr).attr("objid")
                if objid is None:
                    continue
                self._apartment_detail["BUILDING_NUM"] = Pq(tr)(
                    "td:eq(2)").text()
                url = "http://www.tywsfdc.com/Firsthand/tyfc/publish/p/ProNBView.do?proPID={}&nbid={}".format(
                    self._pid, objid)
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
                    'Referer': url
                }
                RequestURL = "http://www.tywsfdc.com/Firsthand/tyfc/publish/probld/NBView.do?nid={}&projectid={}".format(
                    objid, self._pid)
                values = {'nid': objid, 'projectid': self._pid}
                data = urllib.parse.urlencode(values).encode(encoding='UTF8')
                request = urllib.request.Request(url=RequestURL,
                                                 headers=headers,
                                                 data=data)

                m_fp = urllib.request.urlopen(request, timeout=500)
                html_str = m_fp.read().decode("utf8")
                self._extract_data(html_str)
            except Exception as e:
                print(e)
                pass

    def _extract_data(self, doc_str):
        try:
            doc = Pq(doc_str)
            # each building unit
            building_list = doc("ul#bldlist>span")
            for building in building_list:
                bld = doc(building).attr("id")
                bld = bld[3:]
                self._apartment_detail["BUILDING_NUM"] = doc(building).text()
                # each floor:
                xpath = "div.flrlist>table#{}>tr".format(bld)
                tr_list = doc(xpath)
                # total_item =int( doc("").text().strip())
                # count_num = int(total_item) / 12
                for tr in tr_list:
                    self._apartment_detail["FLOOR_NUM"] = Pq(tr)(
                        "td:eq(0)").text()
                    a_list = Pq(tr)("td:eq(1)>span>a")
                    for a in a_list:
                        self._apartment_detail["APARTMENT_NUM"] = doc(a).text()
                        if self._apartment_detail["APARTMENT_NUM"].strip(
                        ) != '':
                            self._apartment_detail[
                                "create_time"] = time.strftime(
                                    '%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
                            self._save_apartments()
        except Exception as err:
            print(err)
            time.sleep(100)
            self._extract_data(doc_str)

    def _insert_community(self):
        result = "INSERT INTO  apartments (COMMUNITY_ID , BUILDING_NUM ,FLOOR_NUM , APARTMENT_NUM ,STATUS ,create_time  )" \
                 " VALUES ('{}','{}','{}','{}','{}','{}'  )".format(self._apartment_detail["COMMUNITY_ID"],
                                                                    self._apartment_detail["BUILDING_NUM"],
                                                                    self._apartment_detail["FLOOR_NUM"],
                                                                    self._apartment_detail["APARTMENT_NUM"],
                                                                    self._apartment_detail["STATUS"],
                                                                    self._apartment_detail["create_time"])
        return result

    def _save_apartments(self):
        # check whether the record already exists in the table
        query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID  = {}  and BUILDING_NUM ='{}'  and APARTMENT_NUM ='{}'and FLOOR_NUM='{}' ; ".format(
            int(self._apartment_detail["COMMUNITY_ID"]),
            self._apartment_detail["BUILDING_NUM"],
            self._apartment_detail["APARTMENT_NUM"],
            self._apartment_detail["FLOOR_NUM"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                str(self._apartment_detail["COMMUNITY_ID"]) +
                self._apartment_detail["BUILDING_NUM"] +
                self._apartment_detail["APARTMENT_NUM"]))
            return
        # insert the record
        try:
            Dao.execute_dmls(self._insert_community())
        except Exception as e:
            print(e)

    def craw(self):
        self._generate_seed_url()
class CommunitiesListCrawler(BaseCrawler):
    global Dao
    Dao = Dao()

    def __init__(self):
        # TODO: run the crawl with parameters and multiple threads

        super().__init__()
        self.detail_info_urls = []
        self.source_id = 21
        self._base_url = "http://www.njhouse.com.cn/persalereg.php"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one url,get page content
        """

        # for single_url in seed_url:
        #     update_sql = "   UPDATE  fetch_list SET  times = times+1 WHERE url = '{}'and source_id =17".format(
        #         single_url[0])
        #     Dao.execute_dmls(update_sql)
        #     self._base_url = single_url[0]
        #     self._now_url = single_url[0]
        #     html = self.get_page_content_str(single_url[0])
        #     try:
        #         self._extract_data(html)
        #     except Exception as e:
        #         print(e)
        #         update_sql = "   UPDATE  fetch_list SET  status  = 1 WHERE url = '{}'and source_id =17".format(
        #             single_url[0])
        #         Dao.execute_dmls(update_sql)

        # single url
        html = self.get_page_content_str(self._seed_url[0])
        self._extract_data(html)
        # b = set(self._resualt)
        # self._resualt=[i for  i in b]
        # # dao=Dao()
        # insert_sql=""
        # for res1 in b :
        # insert_sql = "INSERT INTO merchant_tmp (description,url )VALUES ( '{}', 'http://www.youlin.me/category/407')".format(res1)
        # print( insert_sql  )
        # dao = Dao()
        # dao.execute_dmls(insert_sql)

    def get_page_content_str(self, url):
        time.sleep(1)

        try:
            print("现在开始抓取" + url)
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
            }
            request = urllib.request.Request(url=url, headers=headers)
            m_fp = urllib.request.urlopen(request, timeout=5500)
            html_str_uncode = m_fp.read()
            m_fp.close()
            return html_str_uncode
        except urllib.error.URLError as err:
            # logfile = open('test.log', 'a')
            # logfile.write("Error: {} \n in  url : {}".format(err, url))
            # logfile.close()
            # print("error in {}.get_page_content_str".format(__name__))
            # if url[-3:] == "htm":
            # time.sleep(120)
            #     return self.get_page_content_str(url)
            return None
        except Exception as err:
            print(err)
            return None

    def _generate_seed_url(self):
        """
        generate all url to visit
        """

        # from page 1 to anypage which < 200

        # # add seed urls from the database
        # self._seed_url = Dao._get_url_by_id(self.source_id)

        # add directly, for testing
        self._seed_url.append(self._base_url)

    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        tables = doc("table>tr>td>table")
        # total_item =int( doc("").text().strip())
        # count_num = int(total_item) / 12
        for table in tables:
            try:
                doc = Pq(table)
                # test =  doc(doc("tr")[1]).find("td")[1].text()
                self._community_detail['location'] = Pq(
                    doc("tr:eq(1)"))("td:eq(1)").text()
                self._community_detail['name'] = Pq(
                    doc("tr:eq(2)"))("a").text()
                self._community_detail['url'] = Pq(
                    doc("tr:eq(2)"))("a").attr("href")
                self._community_detail['area_name'] = Pq(
                    doc("tr:eq(8)"))("td:eq(1)").text()

                self._save_community()
            except Exception as err:
                print(table)
                print(err)
                continue

    def _insert_community(self):
        result = "INSERT INTO  communities (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,location,source_id )" \
                 " VALUES ('{}','{}','{}','{}','{}' ,'{}','{}' )".format(self._community_detail["url"],
                                                                    self._community_detail["name"],
                                                                    self._community_detail["area_name"],
                                                                    self._community_detail["latitude"],
                                                                    self._community_detail["longitude"],
                                                                    self._community_detail["location"],
                                                                    self.source_id)
        return result

    def _save_community(self):
        # check whether the record already exists in the table
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(
                self._community_detail["name"]))
            return
        # insert the record
        Dao.execute_dmls(self._insert_community())


if __name__ == '__main__':
    querySql = 'SELECT a.name ,a.COMMUNITY_ID ,b.LATITUDE,b.LONGITUDE FROM  shengchan_20140815.communities a ,shengchan_20140815.community_poses b WHERE a.COMMUNITY_ID =b.COMMUNITY_ID AND a.AREA_ID <11 AND a.AREA_ID > 0 '
    dao = Dao()
    result = dao.execute_query(querySql)
    for name, COMMUNITY_ID, LATITUDE, LONGITUDE in result:
        getmerchants(LATITUDE, LONGITUDE, COMMUNITY_ID)
Example #29
class _Repository:
    def __init__(self):
        self._conn = sqlite3.connect('database.db')
        self.vaccines = Dao(DTO.Vaccine, self._conn)
        self.suppliers = Dao(DTO.Supplier, self._conn)
        self.clinics = Dao(DTO.Clinic, self._conn)
        self.logistics = Dao(DTO.Logistic, self._conn)

    def close(self):
        self._conn.commit()
        self._conn.close()

    def create_tables(self):
        self._conn.executescript("""
        CREATE TABLE IF NOT EXISTS logistics (
             id INTEGER PRIMARY KEY,
             name TEXT NOT NULL,
             count_sent INTEGER NOT NULL ,
             count_received INTEGER NOT NULL
        );
        CREATE TABLE IF NOT EXISTS suppliers (
            id INTEGER PRIMARY KEY,
            name TEXT NOT NULL,
            logistic INTEGER REFERENCES logistics(id)
        );

        CREATE TABLE IF NOT EXISTS clinics (
            id INTEGER PRIMARY KEY,
            location TEXT NOT NULL,
            demand INTEGER NOT NULL ,
            logistic INTEGER REFERENCES logistics(id)
        );
        CREATE TABLE IF NOT EXISTS vaccines (
            id INTEGER PRIMARY KEY,
            date DATE NOT NULL,
            supplier INTEGER REFERENCES suppliers(id),
            quantity INTEGER NOT NULL
        );
        """)

    def receiveShipment(self, nameOfSup, amount, date):
        # insert the next vaccine to the vaccine table
        # get the id of the logistics from the suppliers table using the name

        supplier = self.suppliers.find(name=nameOfSup)
        supplierIndex = supplier[0].id
        # get the id of the last inserted line to create a new id for the new vaccine
        lastId = self.vaccines.getLastInsertedId()
        newId = lastId[0] + 1
        newVaccine = DTO.Vaccine(newId, date, supplierIndex, amount)
        self.vaccines.insert(newVaccine)

        idOfLogistics = supplier[0].logistic

        # update the count_received of this logistics company in logistics table
        logistic = self.logistics.find(id=idOfLogistics)
        currCountRec = logistic[0].count_Received
        set_value = {'count_received': currCountRec + int(amount)}

        # only where the id = idOfLogistics we got from the find query
        cond = {'id': idOfLogistics}
        self.logistics.update(set_value, cond)

    def sendShipment(self, locationOfClinic, amount):
        clinic = self.clinics.find(location=locationOfClinic)
        # get the id of the logistic of this clinic

        idOfLogistics = clinic[0].logistic
        # update the count_sent of this logistics company in logistics table
        logistic = self.logistics.find(id=idOfLogistics)
        currCountSent = logistic[0].count_Sent

        set_value = {'count_sent': currCountSent + int(amount)}

        # only where the id = idOfLogistics we got from the find query
        cond = {"id": idOfLogistics}
        self.logistics.update(set_value, cond)
        # remove amount from inventory
        allVaccines = self.vaccines.findWithASCOrder('date')
        tempAmount = int(amount)
        for vaccine in allVaccines:
            if tempAmount == 0:
                break
            # we need to delete the line since the quantity will be zero

            if vaccine.quantity <= int(tempAmount):
                self.vaccines.delete(id=vaccine.id)
                tempAmount = tempAmount - int(vaccine.quantity)
            # if we can take amount and not delete

            else:
                set_value = {'quantity': vaccine.quantity - int(tempAmount)}
                cond = {"id": vaccine.id}
                self.vaccines.update(set_value, cond)
                tempAmount = 0

        # remove amount from the demand of location

        currDemand = clinic[0].demand

        set_value = {"demand": currDemand - int(amount)}
        cond = {"location": locationOfClinic}
        self.clinics.update(set_value, cond)
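
# Usage sketch for the repository above; it assumes the suppliers, clinics and
# logistics tables have already been seeded and that at least one vaccine row
# exists (getLastInsertedId is read before inserting). The names and amounts
# below are invented for illustration.
if __name__ == '__main__':
    repo = _Repository()
    repo.create_tables()
    repo.receiveShipment('supplier-A', 100, '2021-01-01')
    repo.sendShipment('clinic-central', 40)
    repo.close()
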
class CommunitiesListCrawler(BaseCrawler, threading.Thread):
    global Dao
    Dao = Dao()

    def __init__(self, page_num):
        # TO
        threading.Thread.__init__(self, name=page_num)
        super().__init__()
        self.detail_info_urls = []
        self.source_id = 30
        self.min_page = page_num * 30 + 1
        self.max_page = page_num * 30 + 31
        self._base_url = "http://newhouse.hfhome.cn/"
        self._community_detail = {
            'url': '',
            'name': '',
            'location': '',
            'area_name': '',
            'description': '',
            'latitude': '',
            'longitude': ''
        }

    def _visit_pages(self, seed_url):
        """
        visit one url,get page content
        """
        # 单个url
        html = self.get_page_content_str(seed_url)
        self.findEachBuilding(html)


    def findEachBuilding(self, html):
        doc = Pq(html)
        tr_list = doc("table#GVFwxkz>tr")
        for tr in tr_list:
            name = Pq(tr)("td:eq(1)").text()
            self._community_detail["name"] = name
            href = doc(tr).find("td>a").attr("href")
            if href is None:
                continue
            href = href[href.index("?"):]
            url = "http://newhouse.hfhome.cn/Modal/RoomList.aspx" + href
            if self._check_community(url):
                print(url + "     ---    已经爬取过了")
                continue
            self._community_detail["url"] = url
            self._extract_data(url)

    def _extract_data(self, url):
        community_id = self._save_community()
        doc_str = self.get_page_content_str(url)
        doc = Pq(doc_str)
        tr_list = doc("table>tr")
        try:
            for tr in tr_list:
                Floor_num = Pq(tr)("td:eq(0)").text()
                a_list = doc(tr).find("td.preview>a")
                for a in a_list:
                    apartment_detail = {
                        'COMMUNITY_ID': community_id,
                        'FLOOR_NUM': Floor_num,
                        'APARTMENT_NUM': doc(a).text(),
                        'STATUS': '2',
                        'create_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    }
                    self._save_apartment(apartment_detail)
            sql = "update communities set status = '2' where ORIGINAL_URL = '{}' ; ".format(url)
            Dao.execute_dmls(sql)
        except Exception as e:
            print(e)
            sql = "update communities set status = -1 where ORIGINAL_URL = '{}' ; ".format(url)
            Dao.execute_dmls(sql)

    def _insert_community(self):
        result = "INSERT INTO  communities (ORIGINAL_URL,NAME,AREA_NAME,LATITUDE,LONGITUDE,address,source_id )" \
                 " VALUES ('{}','{}','{}','{}','{}' ,'{}','{}' )".format(self._community_detail["url"],
                                                                         self._community_detail["name"],
                                                                         self._community_detail["area_name"],
                                                                         self._community_detail["latitude"],
                                                                         self._community_detail["longitude"],
                                                                         self._community_detail["location"],
                                                                         self.source_id)
        return result


    def _insert_apartment(self, apartment_detail):
        result = "INSERT INTO  apartments (COMMUNITY_ID  , APARTMENT_NUM ,STATUS ,FLOOR_NUM,create_time  )" \
                 " VALUES ('{}','{}','{}','{}','{}'  )".format(apartment_detail["COMMUNITY_ID"],
                                                               apartment_detail["APARTMENT_NUM"],
                                                               apartment_detail["STATUS"],
                                                               apartment_detail["FLOOR_NUM"],
                                                               apartment_detail["create_time"])
        return result


    def _save_apartment(self, apartment_detail):
        # check whether the record already exists in the table
        query_sql = "SELECT * FROM apartments WHERE COMMUNITY_ID  = {}  and FLOOR_NUM ='{}'  and APARTMENT_NUM ='{}' ".format(
            int(apartment_detail["COMMUNITY_ID"]), apartment_detail["FLOOR_NUM"],
            apartment_detail["APARTMENT_NUM"])
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(str(apartment_detail["COMMUNITY_ID"]) +
                                                          apartment_detail["FLOOR_NUM"] +
                                                          apartment_detail["APARTMENT_NUM"]))
            return
        # insert the record
        try:
            Dao.execute_dmls(self._insert_apartment(apartment_detail))
        except Exception as e:
            print(e)


    def _save_community(self):
        # check whether the record already exists in the table
        query_sql = "SELECT * FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} ".format(
            self._community_detail["url"], self.source_id)
        if Dao.execute_query(query_sql) is not None:
            print(" {} is already exists ,so next".format(self._community_detail["name"]))
            return Dao.execute_query(communityid_sql)[0][0]
        # insert the record
        Dao.execute_dmls(self._insert_community())
        return Dao.execute_query(communityid_sql)[0][0]

    def _check_community(self, url):
        # check whether a completed record already exists in the table
        communityid_sql = "SELECT COMMUNITY_ID FROM communities WHERE ORIGINAL_URL = '{}' and source_id ={} and status = 2 ".format(
            url, self.source_id)
        result = Dao.execute_query(communityid_sql)
        if result is None:
            return False
        return True


    def run(self):
        # for i in range(self.min_page, self.max_page):
        for i in range(363, 397):
            url = "http://newhouse.hfhome.cn/hffd_xkz.aspx?page={}".format(i)
            self._visit_pages(url)
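
# Usage sketch: launch a few crawler threads; the thread count of 4 and the
# page_num arguments are arbitrary choices for illustration (run() as written
# above iterates a fixed page range regardless of page_num).
if __name__ == '__main__':
    threads = [CommunitiesListCrawler(page_num) for page_num in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()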