sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_dailyeco_news", stream=True)
logger = loghelper.get_logger("crawler_dailyeco_news")

NEWSSOURCE = "dailyeco"
RETRY = 3
TYPE = 60005
SOURCE = 13865
URLS = []
CURRENT_PAGE = 1
linkPattern = "cn.dailyeconomic.com/\w+/\d+/\d+/\d+/\d+.html"
Nocontents = []
columns = [
    # {"column": "jmd", "max": 2},
    {
        "column": "投资",
        "max": 1
# -*- coding: utf-8 -*-
import os, sys
import datetime
import xlwt

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, config, util, db

#logger
loghelper.init_logger("audi_export", stream=True)
logger = loghelper.get_logger("audi_export")

mongo = db.connect_mongo()
collection = mongo.demoday.contest_company


def get_topic(topic_id):
    if topic_id == 13:
        return u"人工智能的应用"
    if topic_id == 11:
        return u"数字化"
    if topic_id == 12:
        return u"车·生活"
    return ""


def get_contest_company_stage_status(status):
import os, sys
import time

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import db

#logger
loghelper.init_logger("patch_company_round", stream=True)
logger = loghelper.get_logger("patch_company_round")


def process(corporate_id):
    logger.info("corporate id: %s", corporate_id)
    conn = db.connect_torndb()
    funding = conn.get(
        "select * from funding where corporateId=%s and (active is null or active !='N') order by fundingDate desc limit 1",
        corporate_id)
    if funding is not None:
        # corporate = conn.get("select * from corporate where id=%s", corporate_id)
        # if corporate is not None:
        conn.update("update corporate set round=%s, roundDesc=%s where id=%s",
                    funding["round"], funding["roundDesc"], corporate_id)
    else:
import os, sys
import time

import gevent
from gevent.event import Event
from gevent import monkey

monkey.patch_all()

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, config
import db
import util

#logger
loghelper.init_logger("domain_2_beian", stream=True)
logger = loghelper.get_logger("domain_2_beian")

#mongo
mongo = db.connect_mongo()
collection = mongo.info.beian

BEIANS = []


def whoisCheck():
    while True:
        if len(BEIANS) == 0:
            return
        beian = BEIANS.pop(0)
reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, db, util
import extract, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../spider2/crawler'))
import BaseCrawler

#logger
loghelper.init_logger("deal_news_new", stream=True)
logger = loghelper.get_logger("deal_news_new")


class NewsCrawler(BaseCrawler.BaseCrawler):
    def __init__(self):
        BaseCrawler.BaseCrawler.__init__(self)

    def is_crawl_success(self, url, content):
        if content.find("</html>") == -1:
            return False
        return True


news_crawler = NewsCrawler()
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_yxtl_news", stream=True)
logger = loghelper.get_logger("crawler_yxtl_news")

NEWSSOURCE = "youxituoluo"
RETRY = 3
TYPE = 60005
SOURCE = 13846
URLS = []
CURRENT_PAGE = 1
linkPattern = "www.youxituoluo.com/\d+.html"
Nocontents = []
columns = [
    {
        "column": "new",
        "max": 2
    },
import os, sys
from pymongo import MongoClient
import pymongo
import datetime, time
import re
import pytz

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import config
import util
import proxy_pool

#logger
loghelper.init_logger("itunes_parser", stream=True)
logger = loghelper.get_logger("itunes_parser")

#mongo
(mongodb_host, mongodb_port) = config.get_mongodb_config()
mongo = MongoClient(mongodb_host, mongodb_port)
itunes_collection = mongo.crawler_v2.market_itunes

if __name__ == "__main__":
    logger.info("Start...")
    rexExp = re.compile(r"[一-龥]+")
    apps = itunes_collection.find({"name": rexExp, "parsed": {"$ne": True}})
    logger.info("cnt: %s" % apps.count())
    for app in apps:
        html = app.get("html")
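# --- Illustrative aside, not part of the original itunes_parser file ---
# The Mongo query above matches "name" against [一-龥]+, i.e. it keeps only apps whose
# name contains at least one character in the CJK Unified Ideographs range
# U+4E00..U+9FA5. A minimal, self-contained sketch of the same check (the sample
# names are hypothetical; the pattern below is the escaped equivalent of [一-龥]+):
import re

has_chinese = re.compile(u"[\u4e00-\u9fa5]+")
print bool(has_chinese.search(u"\u5fae\u4fe1 WeChat"))  # True: name contains Chinese characters
print bool(has_chinese.search(u"WeChat"))               # False: Latin-only name is skipped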
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_Kr36_newfs", stream=True)
logger = loghelper.get_logger("crawler_Kr36_newfs")

#mongo
mongo = db.connect_mongo()
collection_news = mongo.article.news

newsid = []
b_id = ""
TYPE = 60008


def find_companyId(sourceId):
    if sourceId == "0" or sourceId == 0:
        return None
# -*- coding: utf-8 -*-
import os, sys

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import db

#logger
loghelper.init_logger("collection_newproduct", stream=True)
logger = loghelper.get_logger("collection_newproduct")


def process():
    logger.info("Process collection new product")
    conn = db.connect_torndb()
    cs = conn.query(
        "select * from company where (active is null or active='Y') and type=41020"
    )
    for c in cs:
        rel = conn.get(
            "select * from collection_company_rel where (active is null or active='Y') and collectionId=21 and companyId=%s",
            c["id"])
        if rel is None:
            conn.insert(
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import helper

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0],
        '../../parser/company/itjuzi'))
import parser_db_util
import itjuzi_helper

#logger
loghelper.init_logger("patch_36kr", stream=True)
logger = loghelper.get_logger("patch_36kr")


def remove_13030_funding():
    conn = db.connect_torndb()
    fs = conn.query(
        "select * from source_funding where sourceCompanyId in (select id from source_company where source=13030)"
    )
    for f in fs:
        conn.execute(
            "delete from source_funding_investor_rel where sourceFundingId=%s",
            f["id"])
        conn.execute("delete from source_funding where id=%s", f["id"])
    conn.close()
# -*- coding: utf-8 -*-
import os, sys
from pymongo import MongoClient
import pymongo
import datetime

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, config
import db
import xlwt

#logger
loghelper.init_logger("export_collection", stream=True)
logger = loghelper.get_logger("export_collection")


def getRound(round):
    if round is None:
        return ""
    if round < 1010:
        return ""
    if round < 1020:
        return "Angel"
    if round < 1030:
        return "Pre-A"
    if round < 1040:
        return "A"
    if round < 1050:
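# --- Illustrative aside, not part of the original export_collection file ---
# getRound maps the numeric round codes stored in the database onto export labels
# by walking the thresholds top to bottom; None and codes below 1010 export as an
# empty cell. A minimal usage sketch (the sample codes are hypothetical and stay
# below 1040 so they only hit the branches shown above):
for code in (None, 1005, 1015, 1025, 1035):
    print code, "->", getRound(code)  # "", "", "Angel", "Pre-A", "A"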
reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_xtecher_news", stream=True)
logger = loghelper.get_logger("crawler_xtecher_news")

NEWSSOURCE = "Xtecher"
RETRY = 3
TYPE = 60001
SOURCE = 13821
URLS = []
links = []
CURRENT_PAGE = 1
linkPattern = "Xfeature/view\?aid=\d+"
Nocontents = []
columns = [
    {"column": None, "max": 3},
]
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import config
import util
import proxy_pool
import db
import aggregator_util

#logger
loghelper.init_logger("appstore_rank", stream=True)
logger = loghelper.get_logger("appstore_rank")

#mongo
(mongodb_host, mongodb_port) = config.get_mongodb_config()
mongo = MongoClient(mongodb_host, mongodb_port)
appstore_rank_collection = mongo.crawler_v2.appstore_rank

total = 0


def request(url, callback):
    proxy = {'type': 'http', 'anonymity': 'high', 'ping': 1, 'transferTime': 5}
    # proxy = {'type': 'http', 'anonymity':'high'}
    proxy_ip = None
    while proxy_ip is None:
import os, sys
import json

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, desc_helper

#logger
loghelper.init_logger("news_domain", stream=True)
logger = loghelper.get_logger("news_domain")

#mongo
mongo = db.connect_mongo()
collection_news = mongo.article.news

if __name__ == "__main__":
    results = list(
        collection_news.aggregate([{
            "$group": {
                "_id": "$domain",
                "count": {
                    "$sum": 1
                }
import loghelper
import config
import util
import proxy_pool
import db

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import crawler_util

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import market.miapp as miapp_parser

#logger
loghelper.init_logger("miapp_trends", stream=True)
logger = loghelper.get_logger("miapp_trends")

#mongo
mongo = db.connect_mongo()
collection = mongo.trend.android
collection_market = mongo.market.android_market

#TODO
cnt = 0
total = 0
TYPE = 16070

# def handle_search_result(response, app):
#     global total
#     global notfound
#
    init_kafka()
    # action: create, delete
    msg = {"source": action, "id": company_id, "detail": source}
    flag = False
    while flag is False:
        try:
            kafkaProducer.send_messages("task_company", json.dumps(msg))
            flag = True
        except Exception, e:
            logger.exception(e)
            time.sleep(60)


#logger
loghelper.init_logger("sh_import", stream=True)
logger = loghelper.get_logger("sh_import")


def insert(shortname, name, brief, website):
    name = name.replace("(开业)", "")
    sourceId = util.md5str(name)
    sid = parser_db_util.save_company_yitai(shortname, name, 13100, sourceId,
                                            brief)
    logger.info("sid:%s->sourceId:%s", sid, sourceId)
    parser_db_util.save_source_company_name(sid, name, 12010)
    parser_db_util.save_source_company_name(sid, shortname, 12020)

    if website is not None and website.strip() != "":
        website = url_helper.url_normalize(website)
        if website is not None and website != "":
            if website.find("http://") == -1 and website.find("https://"):
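# --- Illustrative aside, not part of the original sh_import file ---
# The loop above keeps calling SimpleProducer.send_messages until the send
# succeeds, sleeping 60 seconds after each failure. The same retry pattern as a
# self-contained sketch (producer, topic and payload are hypothetical; the
# send_messages call is the kafka-python SimpleProducer API used above):
import json
import time
import traceback


def send_with_retry(producer, topic, payload, wait=60):
    while True:
        try:
            producer.send_messages(topic, json.dumps(payload))
            return
        except Exception, e:
            traceback.print_exc()  # the scripts above use logger.exception(e)
            time.sleep(wait)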
# -*- coding: utf-8 -*-
__author__ = 'arthur'

# Legacy FA (financial advisor) sources are flagged via source_company
import os, sys
import time

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import loghelper, util, db, name_helper

#logger
loghelper.init_logger("set_endorse_tag_patch", stream=True)
logger = loghelper.get_logger("set_endorse_tag_patch")

if __name__ == "__main__":
    logger.info("Start...")
    conn = db.connect_torndb()
    cs = conn.query(
        "select * from source_company where source in "
        "(13100,13101,13102,13103,13104,13300,13301,13800) order by id")
    for c in cs:
        source = c["source"]
        dict = conn.get("select * from dictionary where value=%s", source)
        tag_id = None
        if dict["subTypeValue"] == 1301:  #FA
            tag_id = 479289
reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import loghelper
import config
import util
import proxy_pool
import db

#logger
loghelper.init_logger("itunes_trends", stream=True)
logger = loghelper.get_logger("itunes_trends")

#mongo
mongo = db.connect_mongo()
collection = mongo.trend.itunes
collection_itunes = mongo.market.itunes

cnt = 0
total = 0


def request(url, callback):
    # proxy = {'type': 'https', 'anonymity':'high', 'ping':1, 'transferTime':5}
    proxy = {'type': 'http', 'anonymity': 'high'}
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_baijia_news", stream=True)
logger = loghelper.get_logger("crawler_baijia_news")

NEWSSOURCE = "Baijia"
RETRY = 3
TYPE = 60001
SOURCE = 13829
URLS = []
CURRENT_PAGE = 1
linkPattern = "/article/\d+"
Nocontents = []
columns = [
    {
        "column": None,
        "max": 2
    },
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate, oss2_helper

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_irui_report", stream=True)
logger = loghelper.get_logger("crawler_irui_report")

NEWSSOURCE = "irui"
RETRY = 3
FILE = "irui_download.pdf"
SOURCE = "艾瑞"
URLS = []
CURRENT_PAGE = 1
Nocontents = []
columns = [
    {
        "column": "business",
        "max": 1
    },
    # {"column": "company", "max": 1},
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, util, extract, url_helper, json, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

# logger
loghelper.init_logger("crawler_sspai_news", stream=True)
logger = loghelper.get_logger("crawler_sspai_news")

# mongo
# mongo = db.connect_mongo()
# collection_news = mongo.article.news

MAX_PAGE_ALL = 50
CURRENT_PAGE = 0
SOURCE = 13814
TYPE = 60001


class SspaiCrawler(BaseCrawler.BaseCrawler):
    def __init__(self):
import os, sys
import traceback

from kafka import (KafkaConsumer, KafkaClient, SimpleProducer)

import process_topic_message
import process_topic_company
import process_company_message
import process_investor_message

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, config

#logger
loghelper.init_logger("process_message", stream=True)
logger = loghelper.get_logger("process_message")

# kafka
kafkaConsumer = None
kafkaProducer = None


def init_kafka():
    global kafkaConsumer
    global kafkaProducer
    (url) = config.get_kafka_config()
    # HashedPartitioner is default
    kafkaConsumer = KafkaConsumer("track_message_v2",
                                  group_id="process_topic_message",
                                  bootstrap_servers=[url],
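# --- Illustrative aside, not part of the original process_message file ---
# A KafkaConsumer built as above is normally drained by iterating it; each item is
# a kafka-python ConsumerRecord whose .value holds the raw message bytes. A minimal
# sketch under that assumption (the handler and the JSON payload shape are
# hypothetical):
import json


def consume_forever(consumer, handle):
    for message in consumer:
        try:
            handle(json.loads(message.value))
        except Exception, e:
            print e  # the scripts above would call logger.exception(e)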
# -*- coding: utf-8 -*-
import os, sys
import time

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, util, db, config, name_helper

#logger
loghelper.init_logger("artifact_correct", stream=True)
logger = loghelper.get_logger("artifact_correct")


def weibo():
    id = -1
    conn = db.connect_torndb()
    while True:
        ars = conn.query(
            "select * from artifact where type=4030 and verify is null and id>%s order by id limit 1000",
            id)
        if len(ars) == 0:
            break
        for a in ars:
            artifact_id = a["id"]
            # logger.info(a["link"])
            if artifact_id > id:
                id = artifact_id
            link = a["link"]
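# --- Illustrative aside, not part of the original artifact_correct file ---
# weibo() walks the artifact table in id-ordered batches of 1000, advancing the
# `id` cursor past the largest id seen so each query resumes where the previous
# batch ended, and stops when a batch comes back empty. The same pattern as a
# generic sketch (the conn.query signature mirrors the usage above; the callback
# is hypothetical):
def scan_in_batches(conn, process_row):
    last_id = -1
    while True:
        rows = conn.query(
            "select * from artifact where id>%s order by id limit 1000",
            last_id)
        if len(rows) == 0:
            break
        for row in rows:
            if row["id"] > last_id:
                last_id = row["id"]
            process_row(row)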
import os, sys

from gevent import monkey; monkey.patch_all()
from pymongo import MongoClient
import pymongo

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, util

#logger
loghelper.init_logger("itunes_aso100", stream=True)
logger = loghelper.get_logger("itunes_aso100")

linkPattern = "/app/rank/appid/\d+"

# Books, games, and newspapers/magazines are not indexed
cates = [
    {"name": "商务", "url": "6000"},
    {"name": "商品指南", "url": "6022"},
    {"name": "教育", "url": "6017"},
    {"name": "娱乐", "url": "6016"},
    {"name": "财务", "url": "6015"},
    {"name": "美食佳饮", "url": "6023"},
    {"name": "健康健美", "url": "6013"},
    {"name": "生活", "url": "6012"},
    {"name": "医疗", "url": "6020"},
    {"name": "音乐", "url": "6011"},
import os, sys
import pymongo
from distutils.version import LooseVersion
import urllib

reload(sys)
sys.setdefaultencoding("utf-8")

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, db, util, name_helper
import android

#logger
loghelper.init_logger("baidu", stream=True)
logger = loghelper.get_logger("baidu")

#mongo
mongo = db.connect_mongo()
collection_index = mongo.market.baidu_index
collection_search = mongo.market.baidu_search
collection_market = mongo.market.android_market

NAMES = []


class BaiduSearchCrawler(BaseCrawler.BaseCrawler):
    def __init__(self):
        BaseCrawler.BaseCrawler.__init__(self)

    def is_crawl_success(self, url, content):
sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util'))
import parser_db_util

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../support'))
import proxy_pool

#logger
loghelper.init_logger("crawler_jd", stream=True)
logger = loghelper.get_logger("crawler_jd")

cnt = 0
SJ = []

#mongo
mongo = db.connect_mongo()
collection_job = mongo.job.job
collection_company = mongo.job.company


class LagouCrawler(BaseCrawler.BaseCrawler):
    def __init__(self, timeout=20):
        BaseCrawler.BaseCrawler.__init__(self, timeout=timeout)

    # implementation of the base-class hook
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_jingmeiti_news", stream=True)
logger = loghelper.get_logger("crawler_jingmeiti_news")

NEWSSOURCE = "jingmeiti"
RETRY = 3
TYPE = 60001
SOURCE = 13843
URLS = []
CURRENT_PAGE = 1
linkPattern = "www.jingmeiti.com/archives/\d+"
Nocontents = []
columns = [
    {
        "column": "new",
        "max": 3
    },
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../util'))
import loghelper, extract, db, util, url_helper, download, extractArticlePublishedDate

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util

#logger
loghelper.init_logger("crawler_baidu", stream=True)
logger = loghelper.get_logger("crawler_baidu")


class NewsDownloader:
    def __init__(self,
                 TYPE=60001,
                 SOURCE=13080,
                 RETRY=20,
                 CATEGORY=None,
                 FORCE=False):
        self.TYPE = TYPE
        self.SOURCE = SOURCE
        self.RETRY = RETRY
        self.CATEGORY = CATEGORY
        self.FORCE = FORCE
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, extract, db, util, url_helper, download

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../parser/util2'))
import parser_mysql_util
import parser_mongo_util

#logger
loghelper.init_logger("crawler_ifanr_news", stream=True)
logger = loghelper.get_logger("crawler_ifanr_news")

#mongo
mongo = db.connect_mongo()
collection_news = mongo.article.news

newsid = []
b_id = ""
SOURCE = 13811
TYPE = 60001
dt = datetime.date.today()
columns = [
    {
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../../util'))
import GlobalValues, crawler_util

sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../..'))
import BaseCrawler

sys.path.append(
    os.path.join(
        os.path.split(os.path.realpath(__file__))[0], '../../../../util'))
import loghelper, util, db

#logger
loghelper.init_logger("crawler_lagou_company", stream=True)
logger = loghelper.get_logger("crawler_lagou_company")

cnt = 0


class LagouCrawler(BaseCrawler.BaseCrawler):
    def __init__(self, timeout=15):
        BaseCrawler.BaseCrawler.__init__(self, timeout=timeout)

    # implementation of the base-class hook
    def is_crawl_success(self, url, content):
        if content.find("</html>") == -1:
            return False
        d = pq(html.fromstring(content.decode("utf-8")))
        title = d('head> title').text().strip()