class Scheduler(object):
    def __init__(self):
        self.crawler = Crawler()
        self.db = Mongo()

    def run(self, url, date):
        """
        Crawl and save the result.
        :param url: URL to crawl
        :param date: crawl date
        :return:
        """
        data = self.crawler.main(url)
        data['date'] = date
        self.db.add(data)

    def main(self, url, h=1, m=0):
        """
        Main entry point.
        :param url: URL to crawl
        :param h: hour at which to run
        :param m: minute at which to run
        :return:
        """
        while True:
            now = datetime.datetime.now()
            date = str(now.year) + '-' + str(now.month) + '-' + str(now.day)
            if now.hour == h and now.minute == m:
                self.run(url, date)
            # Check once every 60 seconds
            time.sleep(60)
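# A minimal usage sketch (not from the original source); the URL is a
# placeholder and Crawler/Mongo are assumed importable from the same package.
if __name__ == '__main__':
    # Crawl the given URL once a day at 01:00
    Scheduler().main('https://example.com/target', h=1, m=0)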
def __init__(self):
    if 'BLAST_TEXT_DB_SERVICE_HOST' in os.environ:
        self._db = Mongo(os.environ['MONGODB_USER'],
                         os.environ['MONGODB_PASSWORD'],
                         os.environ['BLAST_TEXT_DB_SERVICE_HOST'],
                         os.environ['BLAST_TEXT_DB_SERVICE_PORT'])
    else:
        self._db = Mongo('user', 'password', 'localhost', '27017')
def __init__(self):
    if 'TEXT_DB_SERVICE_HOST' in os.environ:
        self.db = Mongo(os.getenv('MONGODB_USER'),
                        os.getenv('MONGODB_PASSWORD'),
                        os.getenv('TEXT_DB_SERVICE_HOST'),
                        os.getenv('TEXT_DB_SERVICE_PORT'))
    else:
        self.db = Mongo('user', 'password', 'localhost', '27017')
def create_graph():
    '''Create a VTuber relation graph.'''
    PAGE_SIZE = 20
    vgraph = networkx.Graph()
    client = Mongo(CONFIG["mongo"]["addr"], 'youtube')
    vtubers = client.loadWholeDoc('vtuber')
    video_num = client.loadWholeDoc('videosv2').count()
    for i in range(0, video_num, PAGE_SIZE):
        v_page = client.loadWholeDoc('videosv2').skip(i).limit(PAGE_SIZE)
        for v in v_page:
            owner = v['channelId']
            try:
                desc = v['description']
            except KeyError:
                # Some videos have no description field
                continue
            extract_relation(owner, desc, vgraph)

    # Preview:
    # networkx.draw(vgraph, with_labels=True, font_weight='bold')
    # plt.show()

    # Map channel ids to display names from the vtuber list
    name_dict = {}
    vtb_dict = {}
    for v in vtubers:
        name = v['channel']
        try:
            channel_id = v['channel_url'].split('/')[-1]
        except (KeyError, AttributeError):
            continue
        name_dict[channel_id] = name
        vtb_dict[channel_id] = v

    # Update the position and size of every node in the graph
    pos = networkx.random_layout(vgraph)

    # Remove every node that is not in the vtuber list
    remove_list = [_n for _n in vgraph.nodes() if _n not in name_dict]
    vgraph.remove_nodes_from(remove_list)

    for _n in vgraph.nodes():
        vgraph.nodes[_n]['viz']['position'] = {
            'x': pos[_n][0] * (-100) * 1.5,
            'y': pos[_n][1] * 100,
            'z': 0
        }
        # Node size scales with the log of the 'regsit' field
        # print(math.log2(vtb_dict[_n]['regsit']))
        # vgraph.nodes[_n]['viz']['size'] = vgraph.degree[_n]
        vgraph.nodes[_n]['viz']['size'] = (math.log2(vtb_dict[_n]['regsit']) - 10) * 10

    vgraph = networkx.relabel_nodes(vgraph, name_dict)
    # networkx.write_gexf(vgraph, '../data/vtb.gexf')
    return vgraph
def __init__(self):
    if 'DATABASE_SERVICE_HOST' in os.environ:
        self.db = Mongo(os.getenv('MONGODB_USER'),
                        os.getenv('MONGODB_PASSWORD'),
                        os.getenv('DATABASE_SERVICE_HOST'),
                        os.getenv('DATABASE_SERVICE_PORT'),
                        os.getenv('MONGODB_DATABASE'))
    else:
        self.db = Mongo('user', 'password', 'localhost', '27017', 'catcatgo_db')
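# The env-var fallback above recurs across several of these services; a hedged
# sketch (not from the original source) of factoring it into one helper,
# assuming the same Mongo(user, password, host, port) constructor used here.
import os

def mongo_from_env(host_var, port_var):
    """Build a Mongo client from environment variables, with local defaults."""
    if host_var in os.environ:
        return Mongo(os.getenv('MONGODB_USER'), os.getenv('MONGODB_PASSWORD'),
                     os.getenv(host_var), os.getenv(port_var))
    return Mongo('user', 'password', 'localhost', '27017')

# e.g. self.db = mongo_from_env('VIDEO_DB_SERVICE_HOST', 'VIDEO_DB_SERVICE_PORT')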
def rejectReq(userName: str, client: Mongo):
    user: dict = client.get_doc({"username": userName}, getenv('PENDING_USER_COLLECTION'))
    if user is None:
        return "Username doesn't exist", False
    ret, msg = client.remove({"username": userName}, getenv('PENDING_USER_COLLECTION'))
    if ret:
        return "User removed from the pending list", True
    return "Error occurred while removing user from the pending list", False
def userExists(userName: str, client: Mongo):
    # Check whether the user exists in the users collection
    if client.isDocExists({'username': f'{userName.strip()}'}, getenv('USER_COLLECTION')):
        return True, False, False
    # Check whether the user exists in the pending collection
    if client.isDocExists({'username': f'{userName.strip()}'}, getenv('PENDING_USER_COLLECTION')):
        return True, True, False
    # Check whether the user is a manager
    if is_manager(userName, client):
        return True, False, True
    return False, None, None
def approveReq(userName: str, client: Mongo):
    user: dict = client.get_doc({"username": userName}, getenv('PENDING_USER_COLLECTION'))
    if user is None:
        return "Username doesn't exist", False
    x = user.copy()
    p_id = x.pop('_id')
    if client.isDocExists({"username": x.get('username')}, getenv('USER_COLLECTION')):
        return "User already exists in main list", False
    u_id = client.insert([x], getenv('USER_COLLECTION'))
    if u_id is None:
        return "Error occurred", False
    client.remove_ById(p_id, getenv('PENDING_USER_COLLECTION'))
    return "User added to main list", True
def isPassCorrect(userName: str, password: str, client: Mongo):
    block = client.get_doc({"username": userName}, getenv('USER_COLLECTION'))
    if block is None:
        return False
    return block.get('password') == password
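# A hedged sketch only (not from the original source): isPassCorrect above
# compares plaintext passwords. If the stored 'password' field held a salted
# hash instead, the check could use Werkzeug's check_password_hash:
from werkzeug.security import check_password_hash

def isPassCorrectHashed(userName: str, password: str, client: Mongo) -> bool:
    # Hypothetical variant; assumes the same get_doc API as the snippets above
    block = client.get_doc({"username": userName}, getenv('USER_COLLECTION'))
    if block is None:
        return False
    return check_password_hash(block.get('password', ''), password)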
def getNcloudConfig(DB: Mongo):
    data: dict = DB.get_doc({}, getenv('ADMIN_COLLECTION'))
    if data is None:
        return None
    return {
        "addall_inNewHosts": data.get('addall_inNewHosts'),
        "autoStartSrvr": data.get('autoStartSrvr'),
        "allowRegistration": data.get('allowRegistration'),
        "pendingNewUser": data.get('pendingNewUser')
    }
class Scheduler(object):
    def __init__(self):
        self.crawler = Crawler()
        self.db = Mongo(MONGO_DB)

    def main(self):
        """
        Main program loop.
        :return:
        """
        self.db.add(MONGO_COLLECTION_URL, {'url': START_URL})
        while self.db.count(MONGO_COLLECTION_URL) > 0:
            url = self.db.remove_one(MONGO_COLLECTION_URL)['url']
            userinfo, new_urls = self.crawler.main(url)
            if userinfo or new_urls:
                self.db.add(MONGO_COLLECTION_USERINFO, userinfo)
                for new_url in new_urls:
                    self.db.add(MONGO_COLLECTION_URL, {'url': new_url})
            else:
                self.db.add(MONGO_COLLECTION_USERINFO, {
                    'user_url': url,
                    'declare': 'This account may have been deactivated'
                })
def gen_mock_data():
    '''Generate fake data for testing.'''
    client = Mongo(CONFIG["mongo"]["addr"], 'youtube')
    data = client.loadWholeDoc('vtuber')
    graph = networkx.Graph()
    vtubers = []
    for i in range(10):
        vtubers.append(data[i]['channel'])
        graph.add_node(data[i]['channel'], viz={}, mod=1, id=0)
    pos = networkx.random_layout(graph)
    counter = 0
    for v in vtubers:
        graph.nodes[v]['id'] = counter
        graph.nodes[v]['viz']['size'] = 20
        graph.nodes[v]['viz']['position'] = {
            'x': pos[v][0] * (-100),
            'y': pos[v][1] * 100,
            'z': 0
        }
        graph.nodes[v]['viz']['color'] = {'r': 255, 'g': 192, 'b': 201, 'a': 1}
        print(graph.nodes[v])
        counter = counter + 1
    graph.add_weighted_edges_from([(vtubers[1], vtubers[5], 3), (vtubers[2], vtubers[4], 2),
                                   (vtubers[1], vtubers[3], 3), (vtubers[1], vtubers[7], 1),
                                   (vtubers[5], vtubers[9], 3), (vtubers[4], vtubers[8], 3),
                                   (vtubers[1], vtubers[8], 3), (vtubers[1], vtubers[2], 3),
                                   (vtubers[6], vtubers[7], 2)])
    # graph = networkx.generate_gexf(graph)  # regenerate ids
    networkx.write_gexf(graph, '../data/mock.gexf')
class BlastText(Resource):
    def __init__(self):
        if 'BLAST_TEXT_DB_SERVICE_HOST' in os.environ:
            self._db = Mongo(os.environ['MONGODB_USER'],
                             os.environ['MONGODB_PASSWORD'],
                             os.environ['BLAST_TEXT_DB_SERVICE_HOST'],
                             os.environ['BLAST_TEXT_DB_SERVICE_PORT'])
        else:
            self._db = Mongo('user', 'password', 'localhost', '27017')

    def get(self, text):
        items = []
        for obj in self._db.get(text):
            items.append({'id': str(obj['_id']), 'url': obj['url'], 'text': obj['text']})
        return items
def compute_ratings_matrix(ratings_matrix_file):
    """
    Compute the ratings matrix.

    Input:
        ratings_matrix_file: filename for the output ratings matrix
    """
    mongo = Mongo('Acme-Supermarket')
    mongo.connect()

    matrix_file = ratings_matrix_file
    hdf5_matrix = tables.open_file(matrix_file, mode='w')
    filters = tables.Filters(complevel=5, complib='blosc')

    products = mongo.database.products.find({}, {'_id': 1})
    products = [p['_id'] for p in products]
    products = numpy.concatenate((numpy.array([-1]), products))
    products_count = mongo.database.products.count()

    customers = mongo.database.actors.find({'_type': 'Customer'}, {'_id': 1})
    customers = [c['_id'] for c in customers]
    customers_count = mongo.database.actors.count({'_type': 'Customer'})

    data_storage = hdf5_matrix.create_earray(hdf5_matrix.root, 'data',
                                             tables.UInt32Atom(),
                                             shape=(0, products_count + 1),
                                             filters=filters,
                                             expectedrows=customers_count)
    # Header row: column 0 is a placeholder (-1), columns 1+ hold product ids
    data_storage.append(products[:][None])

    for customer_id in customers:
        # Column 0: customer id; columns 1+: product ratings
        row = numpy.zeros((products_count + 1, ))
        row[0] = customer_id
        ratings = mongo.database.rates.find({'customer_id': customer_id},
                                            {'product_id': 1, 'value': 1})
        for rating in ratings:
            row[numpy.where(products == rating['product_id'])[0][0]] = rating['value']
        data_storage.append(row[:][None])

    hdf5_matrix.close()
    mongo.disconnect()
    return matrix_file
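# A small read-back sketch (not from the original source) for the HDF5 matrix
# written above; the filename is hypothetical. Row 0 holds product ids (with a
# -1 placeholder in column 0), and column 0 of every later row holds the
# customer id.
import tables

with tables.open_file('ratings_matrix.h5', mode='r') as h5:  # hypothetical filename
    matrix = h5.root.data.read()
    product_ids = matrix[0, 1:]
    customer_ids = matrix[1:, 0]
    ratings = matrix[1:, 1:]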
class BlastVideo(Resource):
    def __init__(self):
        if 'VIDEO_DB_SERVICE_HOST' in os.environ:
            self.db = Mongo(os.getenv('MONGODB_USER'),
                            os.getenv('MONGODB_PASSWORD'),
                            os.getenv('VIDEO_DB_SERVICE_HOST'),
                            os.getenv('VIDEO_DB_SERVICE_PORT'))
        else:
            self.db = Mongo('user', 'password', 'localhost', '27017')

    def get(self, tag):
        items = []
        try:
            for obj in self.db.get(tag):
                items.append({'id': str(obj['_id']), 'url': obj['url'], 'title': obj['title']})
        except Exception as e:
            print(e, file=sys.stderr)
        return items
def main():
    fname = config.log_path + 'article_parse.' + time.strftime("%Y%m%d")
    log.set_logger(level='DEBUG', when="D", limit=1, filename=fname)

    alist = Mongo().scan()
    if not alist:
        log.warn("no articles in mongodb")
        return False

    MyObj = Mysql()
    mobj = Mongo()
    for doc in alist:
        if Parse(MyObj).do(doc):
            mobj.update(doc.get("_id"), done=1)
            log.info("insert mysql success, url:%s" % doc.get('url'))
        else:
            mobj.update(doc.get("_id"), done=-1)
            log.warning("insert mysql failure, task_id:%s, url:%s"
                        % (doc.get('taskid'), doc.get('url')))
class AcmeSupermarket:
    def __init__(self, transactions_filepath):
        self.schema = 'Acme-Supermarket-Recommendations'
        self.transactions_filepath = transactions_filepath
        self.database = Mongo(self.schema)

    def load(self):
        self.database.connect()
        purchases = self.database.database.purchases.find()

        transactions = numpy.array([])
        i = 0
        row_starts = numpy.array([0])
        for purchase in purchases:
            i += 1
            purchase_id = purchase['_id']
            purchase_lines = self.database.database.purchase_lines.find(
                {'purchase_id': purchase_id})
            transaction = numpy.array(
                [line['product_id'] for line in purchase_lines], dtype='i4')
            row_starts = numpy.append(row_starts, row_starts[-1] + transaction.size)
            transactions = numpy.concatenate((transactions, transaction))

        # Pad every transaction with -1 up to the longest one, then stack
        row_ends = numpy.concatenate((row_starts, [transactions.size]))
        lengths = numpy.diff(row_ends)
        pad_lengths = numpy.max(lengths) - lengths
        pad_indices = numpy.repeat(row_ends[1:], pad_lengths)
        transactions_padded = numpy.insert(
            transactions, pad_indices, -1).reshape(-1, numpy.max(lengths))

        numpy.save(self.transactions_filepath, transactions_padded)
        self.database.close()

    def save_rules(self, rules):
        self.database.save_rules(rules)
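# A self-contained illustration (not from the original source) of the
# ragged-to-padded trick load() uses: transactions are concatenated flat, then
# -1 padding is inserted before reshape so every row has the same length.
import numpy

transactions = numpy.array([1, 2, 3, 4, 5, 6])  # three purchases: [1, 2], [3], [4, 5, 6]
row_ends = numpy.array([0, 2, 3, 6])            # cumulative row boundaries
lengths = numpy.diff(row_ends)                  # [2, 1, 3]
pad_lengths = lengths.max() - lengths           # [1, 2, 0]
pad_indices = numpy.repeat(row_ends[1:], pad_lengths)
padded = numpy.insert(transactions, pad_indices, -1).reshape(-1, lengths.max())
# padded == [[ 1  2 -1]
#            [ 3 -1 -1]
#            [ 4  5  6]]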
def getAllUsers(client: Mongo):
    return client.get_docs({}, getenv('USER_COLLECTION'))
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow

from db import Mongo, UseCache
from util import load_config

CONFIG = load_config()
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'
CLIENT_SECRETS_FILE = "client_secret_974874009372-e5f329adu30h1j8qmtc75r5vvgdngvbr.apps.googleusercontent.com.json"
DATABASE = Mongo(CONFIG["mongo"]["addr"], 'youtube')


class YouTube:
    '''Manager entity for YouTube API operations.'''
    _db = None

    def __init__(self, secret):
        # Obtain OAuth2 credentials and build the API client
        # self._db = database
        flow = InstalledAppFlow.from_client_secrets_file(secret, SCOPES)
        credentials = flow.run_local_server()
        self.service = build(API_SERVICE_NAME, API_VERSION,
                             credentials=credentials)
import os

from gensim.models import Word2Vec, KeyedVectors, FastText

from db import Mongo

DB = Mongo()

# Data preprocessing ################################################
db_result = list(DB.cursor()['gallery'].find({"pass": 1}))
result = [post['join'] for post in db_result]

#### Hyperparameters
vec_size = 6
windows = 6
min_count = 10
iteration = 100
workers = 4

model = FastText(sentences=result, size=vec_size, window=windows,
                 min_count=min_count, iter=iteration, workers=workers)
model_result = model.wv.most_similar("AMD 5800X")
print(model_result)
from translator import Translator
from scraper import Scraper
from selenium import webdriver
from db import Mongo
import datetime
import time
import requests
import subprocess

if __name__ == "__main__":
    driver = webdriver.Firefox()
    scraper = Scraper(driver)
    translator = Translator(driver)
    assets = ['itau', 'ambev', 'petrobras']
    db = Mongo(assets)

    today = datetime.datetime(2019, 4, 9)  # Change the time window here
    end_date = datetime.datetime(2020, 1, 1)
    total_days = (end_date - today).days

    for i in range(total_days):
        days_left = (end_date - today).days
        progress = ((i + 1) / (total_days * 1.0)) * 100
        print(f"Today is {today.strftime('%d/%m/%Y')}. There are {days_left} days left")
        print(f"Progress: {progress}%")
        for asset in assets:
            data = scraper.scrape_requests(asset, today)
"""Web application.""" import json import os from flask import abort, Flask, jsonify, request import pika from db import Mongo app = Flask(__name__) mongo = Mongo(app) @app.errorhandler(400) def bad_request(e): """Jsonify 400 response.""" resp = jsonify({'error': str(e)}) resp.status_code = 400 return resp @app.route('/api/sms', methods=['GET', 'POST']) def get_sms(): """SMS endpoint for retrieving and sending messages.""" if request.method == 'POST': data = request.get_json() if 'message' not in data or 'phone' not in data: abort(400)
def getServers(DB: Mongo):
    block: dict = DB.get_docs({}, getenv('SERVER_COLLECTION'))
    NN = Nas(block)
    return NN.getBlock()
'''
This script is used to collect data from https://mamedaifuku.sakura.ne.jp/
'''
import re
from urllib import request
import json

import bs4

from db import Mongo, UseCache
from util import load_config

CONFIG = load_config()
COMMENT_URL_BASE = 'https://mamedaifuku.sakura.ne.jp/live_stream/php/ex_disp_message.php?v={}&disp_message_info_mode=2&disp_message_author_mode=1&disp_message_comment_mode=1&ym=&turning_page_mode=true&message_page=1'
COMMENT_FANS = "commFans"
database = Mongo(CONFIG["mongo"]["addr"], 'youtube')


def get_video_list():
    '''Fetch all videos from all users.'''
    with request.urlopen(
            'https://mamedaifuku.sakura.ne.jp/live_stream/php/ex_return_video_list_json.php?get_video_list_mode=all'
    ) as res:
        # with open('../data/video.json', 'w+') as tmp:
        #     tmp.write(res.read().decode())
        # database = Mongo(CONFIG["mongo"]["addr"], 'youtube')
        data = res.read().decode()
        # print(data)
        database.saveBulkToDoc('videosv2', json.loads(data))
def __init__(self):
    self.crawler = Crawler()
    self.db = Mongo()
def __init__(self, encrypted_account_id):
    self.logger = Logger.Logger()
    self.persistency = Mongo.Mongo(self.logger)
    self.api = RiotClient.RiotClient(self.logger)
    self.encrypted_account_id = encrypted_account_id
def __init__(self, transactions_filepath):
    self.schema = 'Acme-Supermarket-Recommendations'
    self.transactions_filepath = transactions_filepath
    self.database = Mongo(self.schema)
import re

from flask import Blueprint, jsonify, request

from config.config import Config
from db import Mongo
from app.paginate import Paginate

bp_rent = Blueprint('rent', __name__, url_prefix='/')
mongo = Mongo('192.168.99.100', 27017, 'rent591', 'houses').client()


@bp_rent.route('/' + Config.API_BASE_PATH + '/ans1', methods=['GET'])
def api_ans1():
    page = request.args.get('page', 1)
    gender = request.args.get('gender', '男生')
    region = request.args.get('region', '3')
    # Exclude listings whose gender restriction bars the requested gender
    gender_limit = '男生' if gender == '女生' else '女生'
    query = {
        'gender_limit': {'$ne': gender_limit},
        'region': int(region),
    }
    result = mongo.find(query, {"_id": 0})
    p = Paginate(result, current_page=int(page))
    response = {
        'current_page': page,
def userExists(userName: str, client: Mongo):
    return client.isDocExists({'username': f'{userName.strip()}'}, getenv('USER_COLLECTION'))
class HouseRentCrawler:
    """Crawl all house data for the regions in TARGET_REGIONS."""

    TARGET_REGIONS = {
        '台北市': 1,
        '新北市': 3,
    }
    HOUSE_ID_QUEUE = Queue()
    HOUSE_DETAIL_QUEUE = multiprocessing.Queue()
    MONGO_DB = Mongo('192.168.99.100', 27017, 'rent591', 'houses')
    ES = Elastic('192.168.99.100', 9200, 'rent591', 'houses')

    def __init__(self, async_tasks_cnt=2):
        self.target_endpoint = 'https://rent.591.com.tw/'
        self.async_tasks_cnt = async_tasks_cnt

    def start(self):
        threading.Thread(target=self._start_get_house_ids).start()
        threading.Thread(target=self._start_get_house_detail).start()
        pool = multiprocessing.Pool(processes=3)
        while True:
            try:
                region_id, house_detail_info = self.HOUSE_DETAIL_QUEUE.get(block=True, timeout=60)
                pool.apply_async(self.save_house_data, args=(region_id, house_detail_info))
            except Empty:
                break
            except Exception:
                continue

    def _start_get_house_ids(self):
        tasks = []
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        for k in self.TARGET_REGIONS:
            tasks.append(
                asyncio.ensure_future(
                    self._get_house_ids(self.TARGET_REGIONS[k])
                )
            )
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()

    def _start_get_house_detail(self):
        tasks = []
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        for i in range(self.async_tasks_cnt):
            tasks.append(self._get_house_detail())
        loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()

    async def _get_house_ids(self, region_id):
        """
        Get "totalRows" on the first cycle (without "firstRow" & "totalRows"
        in params), then fetch pages until "firstRow" is greater than
        "totalRows".
        """
        prd = self._get_pre_request_data(region_id)
        cookies = prd['cookies']
        headers = {'X-CSRF-TOKEN': prd['csrf_token']}
        params = {'kind': 0, 'region': region_id, 'type': 1, 'searchtype': 1}
        async with aiohttp.ClientSession(cookies=cookies, headers=headers) as session:
            while (
                params.get('firstRow', 0) < int(params.get('totalRows', '0'))
                or not params.get('firstRow')
            ):
                try:
                    async with session.get(f'{self.target_endpoint}home/search/rsList',
                                           params=params) as response:
                        rslist = await response.json()
                        house_ids = [self._get_house_id(data) for data in rslist.get('data').get('data')]
                        for house_id in house_ids:
                            self.HOUSE_ID_QUEUE.put((region_id, house_id))
                        if params.get('firstRow'):
                            params['firstRow'] += 30
                        else:
                            params.update({
                                'firstRow': 30,
                                'totalRows': rslist.get('records', '0').replace(',', ''),
                            })
                except Exception:
                    print(f'error: {params}')
                    continue

    async def _get_house_detail(self):
        while True:
            try:
                region_id, house_id = self.HOUSE_ID_QUEUE.get(block=True, timeout=10)
                async with aiohttp.ClientSession() as session:
                    async with session.get(f'{self.target_endpoint}rent-detail-{house_id}.html') as response:
                        self.HOUSE_DETAIL_QUEUE.put((region_id, await response.text()))
            except Empty:
                break
            except Exception:
                print(f'error: {house_id}')
                continue

    @classmethod
    def save_house_data(cls, region_id, house_detail_info):
        data = HouseParser(house_detail_info, region_id).get_house_info()
        cls.MONGO_DB.save(data)
        cls.ES.save(data)

    def _get_pre_request_data(self, region_id):
        cookies = {'urlJumpIp': region_id}
        params = {'kind': 0, 'region': region_id}
        resp = requests.get(self.target_endpoint, params=params)
        soup = BeautifulSoup(resp.text, 'html.parser')
        csrf_token = soup.find("meta", attrs={"name": "csrf-token"}).get('content')
        cookies.update(resp.cookies)
        return {'csrf_token': csrf_token, 'cookies': cookies}

    def _get_house_id(self, data):
        return data.get('houseid')
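# A hedged usage sketch (not from the original source) for the crawler above;
# the task count is illustrative.
if __name__ == '__main__':
    HouseRentCrawler(async_tasks_cnt=4).start()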
authlist = CONFIG["twitter"]["auth"]
twiAuthList = list(
    map(
        lambda a: twitter.oauth.OAuth(a["oauth_token"], a["oauth_token_secret"],
                                      a["consumer_key"], a["consumer_secret"]),
        authlist))
special = [
    '914724274274832384', '803480007775383552', '1019885045933211648',
    '984028782175629314'
]
black = [746964642660966403]
twiList = list(map(lambda a: twitter.Twitter(auth=a), twiAuthList))
db = Mongo(mongoHost, "twitter")


class UserNotFoundException(Exception):
    def __init__(self, value):
        self.value = value

    def __str__(self):
        return (repr(self.value) + ' is not found.')


@UseCache(db=db, keyword='user_id')
def getUserProfile(apilist, user_id=''):
    '''Get the profile of a user with the given id.'''
from monitoring import Logger
from riot_api import PlayerCrawler
from db import Mongo

# ARAM
queue_id = 450
logger = Logger.Logger()
persistency = Mongo.Mongo(logger)

# Seed account for crawling
seed_encrypted_account_id = '-w9INIopYVNjHShnEGGgdYhREGEW407RXfAG6ltIjfEi_g'
PlayerCrawler.PlayerCrawler(seed_encrypted_account_id).run()

while True:
    match = persistency.get_uncrawled_game(queue_id)
    for participant_identity in match['participantIdentities']:
        encrypted_account_id = participant_identity['player']['currentAccountId']
        if encrypted_account_id != seed_encrypted_account_id:
            PlayerCrawler.PlayerCrawler(encrypted_account_id).run()
    persistency.mark_match_as_crawled(match['gameId'])