示例#1
0
import requests
from cli.log import get_log
from cli.proxy import IpPool
import json
import datetime
from lxml import etree

# Module-level log handle obtained from the project's get_log helper
# under the name 'http'.
logger = get_log('http')

# Default HTTP request headers presenting a desktop Chrome 69 browser on
# Windows 10.
# NOTE(review): 'Host' is hard-coded to www.molbase.com, so sending these
# headers to any other server transmits a mismatched Host header.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
    # NOTE(review): a commented-out 'cookie' entry holding a captured browser
    # session used to sit here; it was dropped so that session identifiers
    # are not kept in source control.
    'Accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Host': 'www.molbase.com',
    'Accept-Encoding': 'gzip, deflate'
}

# Proxy address in "host:port" form.
# NOTE(review): presumably the proxy currently in use, replaced at runtime by
# the proxy-fetching code below -- confirm against the rest of the module.
ips = '52.79.116.117:3128'


def get_proxy_ips():
    # global ips
    # if ips:
    #     resp = requests.get('http://10.9.60.13:5010/delete/?proxy=%s' % ips, headers)
    # resp = requests.get('http://10.9.60.13:5010/get', headers)
    while True:
        try:
            resp = requests.get('http://123.207.35.36:5010/get', headers)
示例#2
0
import requests
import json
from queue import Queue
from selenium import webdriver
from requests.exceptions import ConnectionError, ProxyError
# import pytesseract
# from PIL import Image
from cli.log import get_log

# Module-level log handle obtained from the project's get_log helper
# under the name "http".
log = get_log("http")

# Default HTTP request headers presenting a desktop Chrome 69 browser on
# Windows 10.
# NOTE(review): 'Host' is hard-coded to www.molbase.com, so sending these
# headers to any other server transmits a mismatched Host header.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
    # NOTE(review): a commented-out 'cookie' entry holding a captured browser
    # session used to sit here; it was dropped so that session identifiers
    # are not kept in source control.
    'Accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Host': 'www.molbase.com',
    'Accept-Encoding': 'gzip, deflate'
}

import datetime


def get_proxy_ip():
    resp = requests.get(
        # 5
        'http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
        # 20
        # 'http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&time=2'
示例#3
0
 def __init__(self, size, chart):
     """Record the constructor arguments and set up the instance's log handle.

     Args:
         size: Stored unchanged as ``self.size``.
             NOTE(review): presumably a capacity or worker count -- confirm
             against the methods that read it.
         chart: Stored as ``self.chart`` and also passed to ``get_log`` to
             obtain ``self.log``.
     """
     # A Queue(size) member was sketched here at some point but is not created.
     self.size, self.chart = size, chart
     # The instance starts with its run flag cleared.
     self.isRun = False
     self.log = get_log(chart)
示例#4
0
from cli.cli import get_data, change_ips
from db.db import db_en_cache, en_olbase_err, mongo, db_en_olbase, DuplicateKeyError, repeat
from core.parse import parses, filter_ele
import uuid
from multiprocessing import Process, Queue
from threading import Thread
import random
from queue import Empty
from cli.log import get_log
import time

# Module-level log handle obtained from the project's get_log helper
# under the name 'engin'.
log = get_log('engin')


def run(size):
    queues = [Queue() for i in range(size)]
    p_list = [Process(target=process, args=(queue, )) for queue in queues]
    for p in p_list:
        p.start()
    db = db_en_cache.find()
    count = 0
    while True:
        try:
            url = db.next()['url']
            count += 1
            # if url not in self.url_pool:
            res = db_en_olbase.find_one({'url': url})
            if not res:
                res = repeat().find_one({'url': url})
                if not res:
                    queue = random.choice(queues)
示例#5
0
from cli.cli import get_data, change_ips
from db.db import db_en_cache, db_en_olbase
from lxml import etree
import re
from threading import Thread
import pickle
import os
from cli.log import get_log

# Module-level log handle obtained from the project's get_log helper
# under the name 'link_list'.
log = get_log('link_list')


def process(queue):
    """Placeholder hook: accepts a batch and currently does nothing.

    Args:
        queue: Ignored by this stub.

    Returns:
        None, always.
    """
    return None


def run(size):
    url_pool = []
    db = db_en_cache.find()
    count = 0
    while True:
        try:
            if len(url_pool) < size:
                url = db.next()['url']
                count += 1
                # if url not in self.url_pool:
                res = db_en_olbase.find_one({'url': url})
                if not res:
                    url_pool.append(url)
            else:
                process(url_pool)