def count_done_restaurants():
    """Return the number of restaurant documents flagged as done."""
    database = get_db()
    finished = database.find(restaurants_collection, {"done": True})
    return finished.count()
def count_todo_restaurants():
    """Return the number of restaurant documents not yet processed."""
    database = get_db()
    pending = database.find(restaurants_collection, {'done': None})
    return pending.count()
def count_restaurant_by_city():
    """Recount restaurants per city link and persist the totals.

    For every document in the 'ifood_statistic' collection, count the
    restaurants whose URL starts with that city's link, then upsert the
    count under the current VERSION field of the statistic document.
    """
    import re  # local import: only needed to escape the link prefix

    db = get_db()
    statistic_collection = 'ifood_statistic'
    links = db.find(statistic_collection)
    print(links)
    for l in links:
        link = l.get('link')
        print(link)
        # Escape the link so regex metacharacters common in URLs
        # ('?', '.', '+') match literally; the original unescaped
        # "^{}.*?" prefix could mis-match. The trailing ".*?" was
        # redundant for a prefix match and is dropped.
        query1 = {
            'url': {
                '$regex': "^{}".format(re.escape(link)),
            }
        }
        print(query1)
        count = db.find(restaurants_collection, query1).count()
        print(count)
        query2 = {
            'link': link,
        }
        form = {
            "$set": {
                VERSION: count,
            }
        }
        db.update_one(statistic_collection, query2, form, upsert=True)
def count_todo_links():
    """Return the number of link documents not yet processed."""
    database = get_db()
    pending = database.find(links_collection, {'done': None})
    return pending.count()
def count_done_links():
    """Return the number of link documents flagged as done."""
    database = get_db()
    finished = database.find(links_collection, {"done": True})
    return finished.count()
def mark_restaurant_done(restaurant):
    """Flag the restaurant matching this document's rid as finished."""
    database = get_db()
    selector = {"rid": restaurant.get('rid')}
    flags = {"doing": True, "done": True}
    database.find_and_mark(restaurants_collection, selector, flags)
def mark_link_done(link):
    """Flag the link matching this document's url as finished."""
    database = get_db()
    selector = {"url": link.get('url')}
    flags = {"doing": True, "done": True}
    database.find_and_mark(links_collection, selector, flags)
def init_statistic_collection():
    """Seed the sindelantal statistic collection from the links collection.

    Each link document contributes its title (used as the city name)
    and its stripped URL.
    """
    db = get_db()
    statistic_collection = 'sindelantal_statistic'
    links = db.find(links_collection)
    for link in links:
        # Renamed from 'input' to avoid shadowing the builtin.
        doc = {
            'city_name': link.get('title'),
            'link': link.get('url').strip(),
        }
        db.insert_one(statistic_collection, doc)
def unmark_doing(collection_name):
    """Release documents that were claimed but never completed.

    Clears the 'doing' flag on documents where doing=True while done is
    still None, so another worker can pick them up again.

    Returns the number of documents modified.
    """
    database = get_db()
    stuck = {'doing': True, 'done': None}
    release = {"$set": {'doing': None}}
    outcome = database.update_many(collection_name, stuck, release)
    return outcome.modified_count
def insert_restaurants(restaurants):
    """Upsert restaurant documents keyed by their unique 'rid'.

    Since 'rid' must stay unique, each record is written via
    update + upsert: an existing document is refreshed in place,
    a missing one is inserted.
    """
    database = get_db()
    for restaurant in restaurants:
        selector = {"rid": restaurant.get('rid')}
        change = {"$set": restaurant}
        database.update_one(restaurants_collection, selector, change, True)
def init_statistic_collection():
    """Seed the ifood statistic collection from ../input/links.csv.

    Each line of the file is one city link; the city name is derived
    from the link via get_name_from_link.
    """
    db = get_db()
    statistic_collection = 'ifood_statistic'
    file_path = '../input/links.csv'
    # Explicit encoding so reading does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        links = f.readlines()
    for link in links:
        print(link)
        city_name = get_name_from_link(link)
        # Renamed from 'input' to avoid shadowing the builtin.
        record = {
            'link': link.strip(),
            'city_name': city_name,
        }
        db.insert_one(statistic_collection, record)
def count_all_restaurants():
    """Return the total number of restaurant documents."""
    database = get_db()
    return database.find(restaurants_collection).count()
# NOTE(review): fragment — the opening '[' of this field list is outside
# the visible chunk; these entries are the tail of the CSV header columns.
'Evt_Land_Ty_Val', 'Dh_Img_Host', 'Dh_Img_Path', 'Dh_Img_File',
'Review_Cnt_Latest', 'Review_Cnt_Ceo_Latest', 'Review_Cnt_Ceo_Say_Latest',
'Review_Cnt_Img', 'Review_Cnt_Ceo', 'Review_Cnt_Ceo_Say', 'Comp_No',
'Comp_Nm', 'Dh_Rgn_Ty_Cd', 'Mov_Url', 'Contract_Standard_Fee',
'Contract_Sale_Fee', 'Contract_Sale_Fee_YN', 'Noncontract_Standard_Fee',
'Noncontract_Sale_Fee', 'Noncontract_Sale_Fee_YN', 'Contract_Shop_Yn',
'Baemin_Kitchen_Yn', 'Shop_Prom', 'Ceo_Notice', 'Ad_Yn', 'Meet_Cash',
'Meet_Card', 'Dlvry_Tm', 'Close_Day_Tmp', 'Award_Type', 'Award_Info',
'Cache', 'Live_Yn_Shop', 'Shop_Cpn_Info', 'Shop_Cpn_Yn', 'Live_Yn_Ord',
'Shop_Break_Yn', 'Break_Tm_Info', 'Favorite_Yn', 'Distance',
'Distance_Txt', 'badge', 'sanitation'
]

if __name__ == '__main__':
    # Create the mongo client object.
    mongoclient = get_db()
    # Fetch all data for the 'baemin' collection.
    data_info = mongoclient.all_items('baemin')
    # Save the data to a CSV file.
    raw_file_name = 'raw.csv'
    rawFile = file_path + web_name + raw_file_name
    rawpath = file_path + web_name
    if not os.path.exists(rawpath):
        os.makedirs(rawpath)
    with open(rawFile, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        # Write the header only when the file is new / still empty.
        if not os.path.getsize(rawFile):
            writer.writeheader()
            # NOTE(review): script appears truncated here — data_info is
            # fetched but never written within this chunk; the rest of the
            # export logic presumably follows outside this view.
def all_addresses():
    """Return every row of the 'addresses' table as a list."""
    database = get_db()
    rows = database.select('addresses', '*')
    return list(rows)
def find_todo_link():
    """Atomically claim one unclaimed link document and return it."""
    database = get_db()
    return database.find_one_and_mark(
        links_collection, {"doing": None}, {"doing": True})
import json

from mongodb_utils import Mongodb, get_db
from utils import VERSION

# Load candidate links from the input CSV and bulk-insert them into the
# versioned links collection, deduplicating on 'url'.
db = get_db()
file_path = '../input/links.csv'
# Explicit encoding so reading does not depend on the platform default.
with open(file_path, 'r', encoding='utf-8') as f:
    result = f.readlines()
links_collection = 'links_{}'.format(VERSION)
dict_links = []
# enumerate replaces the hand-rolled 'count' counter.
for count, l in enumerate(result, start=1):
    dict_links.append({
        'id': count,
        'url': l.strip(),
    })
db.insert_many(links_collection, dict_links, ['url'])
import os
import csv
from config import DATE
from mongodb_utils import get_db
from utils import VERSION, WEB_NAME, fields

city_list = ['hk', 'au', 'tw']
filepath = os.path.join(
    os.path.dirname(__file__),
    '../crawlerOutput/{}/{}/'.format(VERSION, WEB_NAME))
# Client object connected to the database.
client_mongo = get_db()

if __name__ == '__main__':
    # Export each city's documents into its own CSV file.
    for city in city_list:
        info_list = client_mongo.all_items(city)
        if not info_list:
            continue
        print('导出%s%s数据' % (DATE, city))
        target = os.path.join(filepath, city + '.csv')
        with open(target, 'a', encoding='utf-8', newline='') as handle:
            writer = csv.DictWriter(handle, fieldnames=fields)
            # Write the header only when the file is new / still empty.
            if not os.path.getsize(target):
                writer.writeheader()
            for info in info_list:
                writer.writerow(info)
# -*- coding: utf-8 -*- import csv import datetime import os import time import requests import sys import traceback import importlib from mongodb_utils import get_db importlib.reload(sys) # 创建链接数据库的对象 mongo_client = get_db() headers = { 'Connection': 'keep-alive', 'Accept': '*/*', 'User-Agent': 'foodpanda/2.15 (iPhone; iOS 10.1.1; Scale/3.00)', 'X-FP-API-KEY': 'iphone', 'App-version': 'foodpanda_2.15', } date = datetime.datetime.now().strftime("%Y-%m-%d") file_path = os.path.dirname(os.path.split( os.path.realpath(__file__))[0]) + '/crawlerOutput/' + date + '/' web_name = 'foodpanda/' raw_file_name = 'raw.csv' cuisine_file_name = 'cuisine.csv' deduplicate_file_name = 'deduplicate.csv'
import os
import csv
import json

from utils import VERSION
from mongodb_utils import get_db

# Get the client object connected to the database.
c_mongo = get_db()
# Collection name holding the scraped comments for this VERSION.
table_name = VERSION + 'comment'
# Directory the per-restaurant comment files are written into.
filepath = '../data/{}/comments/'.format(VERSION)
# Column header for the comment CSV files.
comments_header = [
    'source', 'comment_name', 'comment_date', 'comment_rating',
    'comment_desc', 'comment_reply', 'rating_name', 'rating_value',
    'rating_date'
]

if __name__ == '__main__':
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    # Fetch all documents stored in the comment collection.
    data_dict = c_mongo.all_items(table_name)
    for data in data_dict:
        # File name for this entry's comment file.
        comment_name = data['comment_name']
        print(comment_name)
        # Comment content.
        # NOTE(review): the loop body appears truncated here — the code
        # that writes the comment file is outside this chunk.
def count_all_links():
    """Return the total number of link documents."""
    database = get_db()
    return database.find(links_collection).count()
def find_todo_restaurant():
    """Atomically claim one unclaimed restaurant document and return it."""
    database = get_db()
    return database.find_one_and_mark(
        restaurants_collection, {"doing": None}, {"doing": True})
def update_restaurant(r):
    """Replace the stored restaurant matching r's rid with r.

    NOTE(review): this passes the whole document r as the update argument
    of find_one_and_mark, unlike insert_restaurants which goes through
    update_one with "$set" — confirm the wrapper accepts a raw document.
    """
    database = get_db()
    selector = {'rid': r.get('rid')}
    database.find_one_and_mark(restaurants_collection, selector, r)
def update_links_to_db(links):
    """Bulk-insert link documents into the versioned collection.

    The trailing ['url'] argument marks 'url' as the uniqueness key.
    """
    database = get_db()
    collection = 'links_{}'.format(VERSION)
    database.insert_many(collection, links, ['url'])