예제 #1
0
def count_done_restaurants():
    """Return the number of restaurant records whose ``done`` field is True.

    Looks up ``restaurants_collection`` through the shared database wrapper
    and returns whatever ``count()`` reports for the matching result set.
    """
    finished_filter = {"done": True}
    matches = get_db().find(restaurants_collection, finished_filter)
    return matches.count()
예제 #2
0
def count_todo_restaurants():
    """Return the number of restaurant records matching ``{'done': None}``.

    The filter is passed unchanged to the database wrapper's ``find`` on
    ``restaurants_collection``; the size of the result set is returned.
    """
    pending_filter = {'done': None}
    matches = get_db().find(restaurants_collection, pending_filter)
    return matches.count()
예제 #3
0
def count_restaurant_by_city():
    """Record, per statistic link, how many restaurant URLs start with it.

    For every document in the ``ifood_statistic`` collection, count the
    documents in ``restaurants_collection`` whose ``url`` begins with the
    statistic document's ``link``, then store that count on the statistic
    document under the ``VERSION`` key (upserting by ``link``).

    The link is escaped before being embedded in the ``$regex`` so that
    characters such as ``.``, ``?`` or ``+`` inside a URL are matched
    literally instead of acting as regex operators. Documents with a missing
    or empty ``link`` are skipped: a missing link would otherwise be matched
    against the literal text ``None`` and an empty one against every URL.

    Progress is printed to stdout, as before.
    """
    import re  # local import: only this function needs it

    db = get_db()
    statistic_collection = 'ifood_statistic'
    links = db.find(statistic_collection)
    print(links)
    for entry in links:
        link = entry.get('link')
        print(link)
        if not link:
            # Nothing meaningful to use as a URL prefix.
            continue

        # Anchored prefix match; a trailing ".*?" adds nothing to a search.
        prefix_query = {
            'url': {
                '$regex': '^' + re.escape(link),
            }
        }
        print(prefix_query)
        count = db.find(restaurants_collection, prefix_query).count()
        print(count)

        # Store the count for this crawl version on the statistic document.
        same_link = {
            'link': link,
        }
        form = {
            "$set": {
                VERSION: count,
            }
        }
        db.update_one(statistic_collection, same_link, form, upsert=True)
예제 #4
0
def count_todo_links():
    """Return the number of link records matching ``{'done': None}``.

    The filter is passed unchanged to the database wrapper's ``find`` on
    ``links_collection``; the size of the result set is returned.
    """
    pending_filter = {'done': None}
    matches = get_db().find(links_collection, pending_filter)
    return matches.count()
예제 #5
0
def count_done_links():
    """Return the number of link records whose ``done`` field is True.

    Looks up ``links_collection`` through the shared database wrapper and
    returns whatever ``count()`` reports for the matching result set.
    """
    finished_filter = {"done": True}
    matches = get_db().find(links_collection, finished_filter)
    return matches.count()
예제 #6
0
def mark_restaurant_done(restaurant):
    """Set ``doing`` and ``done`` to True for the given restaurant.

    Args:
        restaurant: mapping with a ``rid`` entry; the record to mark is
            selected by that ``rid`` in ``restaurants_collection``.
    """
    db = get_db()
    selector = {"rid": restaurant.get('rid')}
    flags = {"doing": True, "done": True}
    db.find_and_mark(restaurants_collection, selector, flags)
예제 #7
0
def mark_link_done(link):
    """Set ``doing`` and ``done`` to True for the given link.

    Args:
        link: mapping with a ``url`` entry; the record to mark is selected
            by that ``url`` in ``links_collection``.
    """
    db = get_db()
    selector = {"url": link.get('url')}
    flags = {"doing": True, "done": True}
    db.find_and_mark(links_collection, selector, flags)
def init_statistic_collection():
    """Seed ``sindelantal_statistic`` with one document per stored link.

    Each document found in ``links_collection`` yields a new statistic
    document holding the link's ``title`` as ``city_name`` and its
    whitespace-stripped ``url`` as ``link``.
    """
    db = get_db()
    statistic_collection = 'sindelantal_statistic'
    for link_doc in db.find(links_collection):
        db.insert_one(
            statistic_collection,
            {
                'city_name': link_doc.get('title'),
                'link': link_doc.get('url').strip(),
            },
        )
예제 #9
0
def unmark_doing(collection_name):
    """Reset ``doing`` to None on records that were started but not finished.

    Args:
        collection_name: name of the collection to update.

    Returns:
        The ``modified_count`` reported by the bulk update.
    """
    db = get_db()
    # Records flagged as in progress that have no ``done`` value.
    started_not_done = {'doing': True, 'done': None}
    reset_doing = {"$set": {'doing': None}}
    outcome = db.update_many(collection_name, started_not_done, reset_doing)
    return outcome.modified_count
예제 #10
0
def insert_restaurants(restaurants):
    """Write each restaurant to ``restaurants_collection``, keyed by ``rid``.

    Args:
        restaurants: iterable of mappings, each carrying a ``rid`` entry.
    """
    db = get_db()
    # ``rid`` must not be duplicated. Rather than a bulk insert with a
    # uniqueness condition, each record goes through an update combined with
    # upsert: update it if it exists, insert it otherwise.
    # NOTE(review): the trailing ``True`` is presumably the wrapper's upsert
    # flag -- confirm against ``update_one``'s signature.
    for restaurant in restaurants:
        db.update_one(
            restaurants_collection,
            {"rid": restaurant.get('rid')},
            {"$set": restaurant},
            True,
        )
예제 #11
0
def init_statistic_collection():
    """Seed ``ifood_statistic`` with one document per line of the links file.

    Reads ``../input/links.csv`` line by line, prints each raw line, derives
    a city name from it via ``get_name_from_link`` and inserts a document
    holding the stripped line as ``link`` together with that ``city_name``.
    """
    db = get_db()
    statistic_collection = 'ifood_statistic'
    file_path = '../input/links.csv'
    with open(file_path, 'r') as f:
        raw_lines = f.readlines()

    for raw_line in raw_lines:
        print(raw_line)
        # The helper receives the line exactly as read (not stripped).
        city_name = get_name_from_link(raw_line)
        db.insert_one(
            statistic_collection,
            {'link': raw_line.strip(), 'city_name': city_name},
        )
예제 #12
0
def count_all_restaurants():
    """Return the total number of records in ``restaurants_collection``."""
    everything = get_db().find(restaurants_collection)
    return everything.count()
예제 #13
0
    'Evt_Land_Ty_Val', 'Dh_Img_Host', 'Dh_Img_Path', 'Dh_Img_File',
    'Review_Cnt_Latest', 'Review_Cnt_Ceo_Latest', 'Review_Cnt_Ceo_Say_Latest',
    'Review_Cnt_Img', 'Review_Cnt_Ceo', 'Review_Cnt_Ceo_Say', 'Comp_No',
    'Comp_Nm', 'Dh_Rgn_Ty_Cd', 'Mov_Url', 'Contract_Standard_Fee',
    'Contract_Sale_Fee', 'Contract_Sale_Fee_YN', 'Noncontract_Standard_Fee',
    'Noncontract_Sale_Fee', 'Noncontract_Sale_Fee_YN', 'Contract_Shop_Yn',
    'Baemin_Kitchen_Yn', 'Shop_Prom', 'Ceo_Notice', 'Ad_Yn', 'Meet_Cash',
    'Meet_Card', 'Dlvry_Tm', 'Close_Day_Tmp', 'Award_Type', 'Award_Info',
    'Cache', 'Live_Yn_Shop', 'Shop_Cpn_Info', 'Shop_Cpn_Yn', 'Live_Yn_Ord',
    'Shop_Break_Yn', 'Break_Tm_Info', 'Favorite_Yn', 'Distance',
    'Distance_Txt', 'badge', 'sanitation'
]

if __name__ == '__main__':
    # Create the MongoDB connection object
    mongoclient = get_db()
    # Fetch all the data
    # NOTE(review): ``data_info`` is not used in the visible part of this
    # script -- presumably the rows are written further down; confirm.
    data_info = mongoclient.all_items('baemin')

    # Save the data to a CSV file
    raw_file_name = 'raw.csv'
    rawFile = file_path + web_name + raw_file_name

    # Create the output directory if it does not exist yet
    rawpath = file_path + web_name
    if not os.path.exists(rawpath):
        os.makedirs(rawpath)

    # Open in append mode; the header row is written only while the file is
    # still empty
    with open(rawFile, 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=header)
        if not os.path.getsize(rawFile):
            writer.writeheader()
예제 #14
0
def all_addresses():
    """Return every column of every row in the ``addresses`` table as a list.

    The rows come from the database wrapper's ``select`` called with the
    table name and ``'*'`` as the column specification.
    """
    rows = get_db().select('addresses', '*')
    return list(rows)
예제 #15
0
def find_todo_link():
    """Fetch one link whose ``doing`` is None and mark it ``doing: True``.

    Returns:
        Whatever ``find_one_and_mark`` returns for ``links_collection``.
    """
    db = get_db()
    return db.find_one_and_mark(
        links_collection,
        {"doing": None},
        {"doing": True},
    )
예제 #16
0
import json

from mongodb_utils import Mongodb, get_db
from utils import VERSION

# Shared database wrapper that receives the links.
db = get_db()

# Read the links file; each line holds one URL.
file_path = '../input/links.csv'
with open(file_path, 'r') as f:
    result = f.readlines()

# The target collection name is versioned.
links_collection = 'links_{}'.format(VERSION)

# One record per line, numbered from 1 in file order.
dict_links = [
    {'id': position, 'url': line.strip()}
    for position, line in enumerate(result, start=1)
]

db.insert_many(links_collection, dict_links, ['url'])
예제 #17
0
import os
import csv
from config import DATE
from mongodb_utils import get_db
from utils import VERSION, WEB_NAME, fields

# Collections (one per city) to export.
city_list = ['hk', 'au', 'tw']

# Output directory, relative to this script's location.
filepath = os.path.join(os.path.dirname(__file__),
                        '../crawlerOutput/{}/{}/'.format(VERSION, WEB_NAME))

# Create the database connection object.
client_mongo = get_db()

if __name__ == '__main__':
    # Make sure the output directory exists; otherwise the first export for
    # a new VERSION/WEB_NAME fails with FileNotFoundError when opening the
    # CSV. ``abspath`` removes the '..' component before creating it.
    os.makedirs(os.path.abspath(filepath), exist_ok=True)

    for city in city_list:
        info_list = client_mongo.all_items(city)
        if len(info_list) == 0:
            # Nothing stored for this city: do not create an empty file.
            continue
        print('导出%s%s数据' % (DATE, city))
        filename = os.path.join(filepath, city + '.csv')
        with open(filename, 'a', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fields)
            # Append mode: write the header only while the file is empty.
            if not os.path.getsize(filename):
                writer.writeheader()
            writer.writerows(info_list)
예제 #18
0
# -*- coding: utf-8 -*-
import csv
import datetime
import os
import time
import requests
import sys
import traceback
import importlib
from mongodb_utils import get_db

# NOTE(review): reloading ``sys`` looks like a leftover from a Python 2
# default-encoding workaround -- confirm before removing.
importlib.reload(sys)

# Create the database connection object.
mongo_client = get_db()

# HTTP request headers sent with each API call.
headers = {
    'Connection': 'keep-alive',
    'Accept': '*/*',
    'User-Agent': 'foodpanda/2.15 (iPhone; iOS 10.1.1; Scale/3.00)',
    'X-FP-API-KEY': 'iphone',
    'App-version': 'foodpanda_2.15',
}

# Today's date, used as the name of the per-day output directory.
date = datetime.datetime.now().strftime("%Y-%m-%d")

# <parent of this script's directory>/crawlerOutput/<date>/
file_path = (
    os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    + '/crawlerOutput/'
    + date
    + '/'
)

# Sub-directory and file names used below ``file_path``.
web_name = 'foodpanda/'
raw_file_name = 'raw.csv'
cuisine_file_name = 'cuisine.csv'
deduplicate_file_name = 'deduplicate.csv'
예제 #19
0
import os
import csv
import json
from utils import VERSION
from mongodb_utils import get_db

# Get the database connection object
c_mongo = get_db()

# Table name
table_name = VERSION + 'comment'

# Where the files are saved
filepath = '../data/{}/comments/'.format(VERSION)

# Header row for the comment files
comments_header = [
    'source', 'comment_name', 'comment_date', 'comment_rating', 'comment_desc',
    'comment_reply', 'rating_name', 'rating_value', 'rating_date'
]

if __name__ == '__main__':
    # Create the output directory if it does not exist yet
    if not os.path.exists(filepath):
        os.makedirs(filepath)
    # Fetch all the data stored in the table
    data_dict = c_mongo.all_items(table_name)
    for data in data_dict:
        # File name of the comment file
        comment_name = data['comment_name']
        print(comment_name)
        # Comment content
예제 #20
0
def count_all_links():
    """Return the total number of records in ``links_collection``."""
    everything = get_db().find(links_collection)
    return everything.count()
예제 #21
0
def find_todo_restaurant():
    """Fetch one restaurant whose ``doing`` is None and mark it ``doing: True``.

    Returns:
        Whatever ``find_one_and_mark`` returns for ``restaurants_collection``.
    """
    db = get_db()
    return db.find_one_and_mark(
        restaurants_collection,
        {"doing": None},
        {"doing": True},
    )
예제 #22
0
def update_restaurant(r):
    """Apply the fields of ``r`` to the stored restaurant with the same ``rid``.

    Args:
        r: mapping with a ``rid`` entry; it is used both to select the
            record and as the update passed to ``find_one_and_mark``.
    """
    db = get_db()
    by_rid = {'rid': r.get('rid')}
    db.find_one_and_mark(restaurants_collection, by_rid, r)
예제 #23
0
def update_links_to_db(links):
    """Insert ``links`` into the versioned links collection.

    Args:
        links: records to insert; ``['url']`` is passed along as the third
            argument of the wrapper's ``insert_many``.
    """
    db = get_db()
    target_collection = 'links_{}'.format(VERSION)
    db.insert_many(target_collection, links, ['url'])