Example #1
def crawler():
    # Establishing connection with mongodb
    client = pymongo.MongoClient(config["localhost"], config["port_num"])
    db = client[config['database_name']]
    col = db[config['collection_name']]

    # starting scraping
    if col.count_documents(
        {}) == 0:  # if collection is empty : scrape flinkhub.com
        links_list1 = []
        headers1 = {
            'User-Agent': config['user_agent'],
        }
        try:  # send request
            logger.debug("Making HTTP GET request: " + config['host_name'])
            r1 = requests.get(config['host_name'], headers=headers1)
            res1 = r1.text
            logger.debug("Got HTML source, content length = " + str(len(res1)))

        except:  # if cannot request url
            logger.exception("Failed to get HTML source from " +
                             config['host_name'])
            traceback.print_exc()
            return links_list1

        logger.debug("Extracting links from the HTML")
        soup1 = BeautifulSoup(
            res1, 'html.parser')  # converting request to soup object

        # saving html content to a .txt file
        try:
            file_name1 = ''.join(
                random.choices(string.ascii_uppercase + string.digits, k=16))
            file_name1 = file_name1 + '.txt'
            file_path1 = os.path.join(os.getcwd(), config["file_dir"],
                                      file_name1)
            text_file1 = open(file_path1, "w")
            n1 = text_file1.write(str(soup1))
            text_file1.close()
        except:
            logger.exception("Cannot write link in a file.")

        if 'Content-Length' in r1.headers:
            new_doc = {
                "link": config["host_name"],
                "source_link": None,
                "is_crawled": True,
                "last_crawl_date": datetime.datetime.utcnow(),
                "response_status": r1.status_code,
                "content_type": r1.headers['Content-Type'],
                "con_length": r1.headers['Content-Length'],
                "file_path": file_path1,
                "created_at": datetime.datetime.utcnow(),
            }
        else:
            new_doc = {
                "link": config["host_name"],
                "source_link": None,
                "is_crawled": True,
                "last_crawl_date": datetime.datetime.utcnow(),
                "response_status": r1.status_code,
                "content_type": r1.headers['Content-Type'],
                "con_length": len(r1.content),
                "file_path": file_path1,
                "created_at": datetime.datetime.utcnow(),
            }
        col.insert_one(new_doc)  # inserting original link to a document

        links1 = soup1.find_all("a")  # finding all a tags

        for link1 in links1:  # iterating over all the links
            temp1 = link1.get('href')  # getting url
            if temp1 not in links_list1:  # if link was not scraped in the same cycle
                links_list1.append(temp1)

                # checking validity of link
                temp_parse1 = urllib.parse.urlparse(temp1)
                netloc_bool1 = bool(temp_parse1.netloc)
                scheme_bool1 = bool(temp_parse1.scheme)
                if netloc_bool1:
                    if scheme_bool1:
                        # if link is valid and absolute url
                        actual_link1 = temp1
                        query1 = {"link": actual_link1}
                        if col.count_documents(query1) == 0:
                            temp_doc1 = {
                                "link": actual_link1,
                                "source_link": config['host_name'],
                                "is_crawled": False,
                                "last_crawl_date": None,
                                "response_status": None,
                                "content_type": None,
                                "con_length": None,
                                "file_path": None,
                                "created_at": datetime.datetime.utcnow()
                            }
                            col.insert_one(
                                temp_doc1)  # adding link to a document
                        else:
                            print(temp1 + " already exists in database."
                                  )  # if link already exists in database
                    else:
                        print(temp1 +
                              " link not valid")  # if link is not valid
                else:
                    # if link is a relative url
                    actual_link2 = urllib.parse.urljoin(
                        config['host_name'], temp1)
                    temp_parse2 = urllib.parse.urlparse(actual_link2)
                    netloc_bool2 = bool(temp_parse2.netloc)
                    scheme_bool2 = bool(temp_parse2.scheme)
                    if netloc_bool2 and scheme_bool2:  # if relative url is valid
                        query2 = {"link": actual_link2}
                        if col.count_documents(
                                query2
                        ) == 0:  # if link doesn't exist in collection already
                            temp_doc1 = {
                                "link": actual_link2,
                                "source_link": config['host_name'],
                                "is_crawled": False,
                                "last_crawl_date": None,
                                "response_status": None,
                                "content_type": None,
                                "con_length": None,
                                "file_path": None,
                                "created_at": datetime.datetime.utcnow()
                            }
                            col.insert_one(
                                temp_doc1)  # inserting link to collection
                        else:
                            print(str(actual_link2) + " already exists."
                                  )  # if link already exists in collection
                    else:
                        print(actual_link2 +
                              " not valid")  # if link is not valid
            else:
                print(temp1 + " Link already scraped"
                      )  # if link is already scraped in the same cycle
        return links1

    else:  # if there exist some links in the collection already
        if col.count_documents({
                "is_crawled": False
        }) > 0:  # if there exist some documents which are not crawled

            # picking a random link from the collection to be scraped
            num1 = col.count_documents({"is_crawled": False})
            random1 = math.floor(random.random() * num1)
            cursor_doc = col.find({"is_crawled": False}).limit(1).skip(random1)
            for curs in cursor_doc:
                doc = curs
            links_list2 = []
            og_link = doc['link']
            headers2 = {
                'User-Agent': config['user_agent'],
            }
            try:  # requesting link
                logger.debug("Making HTTP GET request: " + og_link)
                r2 = requests.get(og_link, headers=headers2)
                res2 = r2.text
                logger.debug("Got HTML source, content length = " +
                             str(len(res2)))
            except:
                logger.exception("Failed to get HTML source from " + og_link)
                traceback.print_exc()
                return links_list2

            logger.debug("Extracting links from the HTML")
            soup2 = BeautifulSoup(
                res2, 'html.parser')  # converting request to a soup object
            # saving html content to a file
            try:
                file_name2 = ''.join(
                    random.choices(string.ascii_uppercase + string.digits,
                                   k=16))
                file_name2 = file_name2 + '.txt'
                file_path_2 = os.path.join(os.getcwd(), config['file_dir'],
                                           file_name2)
                text_file2 = open(file_path_2, "w")
                n2 = text_file2.write(str(soup2))
                text_file2.close()
            except:
                logger.exception("Cannot write link in a file.")

            if 'Content-Length' in r2.headers:
                updated_doc = {
                    "is_crawled": True,
                    "last_crawl_date": datetime.datetime.utcnow(),
                    "response_status": r2.status_code,
                    "content_type": r2.headers['Content-Type'],
                    "con_length": r2.headers['Content-Length'],
                    "file_path": file_path_2,
                }
            else:
                updated_doc = {
                    "is_crawled": True,
                    "last_crawl_date": datetime.datetime.utcnow(),
                    "response_status": r2.status_code,
                    "content_type": r2.headers['Content-Type'],
                    "con_length": len(r2.content),
                    "file_path": file_path_2,
                }

            col.update_one(
                doc,
                {"$set": updated_doc})  # updating link which was just scraped

            links2 = soup2.find_all("a")  # finding all anchor tags
            for link2 in links2:
                temp2 = link2.get('href')  # getting link from a tag
                if temp2 not in links_list2:  # if link wasn't seen in this cycle
                    links_list2.append(temp2)

                    # checking validity of links
                    temp_parse3 = urllib.parse.urlparse(temp2)
                    netloc_bool3 = bool(temp_parse3.netloc)
                    scheme_bool3 = bool(temp_parse3.scheme)
                    if netloc_bool3:
                        if scheme_bool3:
                            # valid absolute link
                            actual_link3 = temp2
                            query3 = {"link": actual_link3}
                            if col.count_documents(query3) == 0:
                                temp_doc = {
                                    "link": actual_link3,
                                    "source_link": og_link,
                                    "is_crawled": False,
                                    "last_crawl_date": None,
                                    "response_status": None,
                                    "content_type": None,
                                    "con_length": None,
                                    "file_path": None,
                                    "created_at": datetime.datetime.utcnow()
                                }
                                col.insert_one(
                                    temp_doc)  # adding link to the collection
                            else:
                                print(temp2 + " already exists."
                                      )  # if link already exists in collection
                        else:  # if link is not valid
                            print(temp2 + " link not valid")
                    else:
                        # link is a relative link
                        actual_link4 = urllib.parse.urljoin(og_link, temp2)
                        temp_parse4 = urllib.parse.urlparse(actual_link4)
                        netloc_bool4 = bool(temp_parse4.netloc)
                        scheme_bool4 = bool(temp_parse4.scheme)
                        if netloc_bool4 and scheme_bool4:
                            # valid relative link
                            query4 = {"link": actual_link4}
                            if col.count_documents(
                                    query4
                            ) == 0:  # checking for existence of link in collection
                                temp_doc = {
                                    "link": actual_link4,
                                    "source_link": og_link,
                                    "is_crawled": False,
                                    "last_crawl_date": None,
                                    "response_status": None,
                                    "content_type": None,
                                    "con_length": None,
                                    "file_path": None,
                                    "created_at": datetime.datetime.utcnow()
                                }
                                col.insert_one(
                                    temp_doc)  # adding link to the collection
                            else:
                                print(actual_link4 + " already exists."
                                      )  # link already exists in collection
                        else:
                            print(actual_link4 +
                                  " not valid")  # link is not valid
            return links2  # return list of links found

        else:  # if there are no links which are not crawled yet
            valid_docs = col.find({})
            # finding links which were not crawled in last 24 hours
            time_dif = datetime.timedelta(days=1)
            greater_than_24_docs = []
            for single_doc in valid_docs:
                if datetime.datetime.utcnow() - single_doc["last_crawl_date"] > time_dif:
                    greater_than_24_docs.append(single_doc)
            num2 = len(greater_than_24_docs)
            # picking a random link out of those links which were not crawled in last 24 hours
            random2 = random.randint(0, num2 - 1)
            doc = greater_than_24_docs[random2]
            links_list2 = []
            og_link = doc['link']
            headers2 = {
                'User-Agent': config['user_agent'],
            }
            # making a https request
            try:
                logger.debug("Making HTTP GET request: " + og_link)
                r2 = requests.get(og_link, headers=headers2)
                res2 = r2.text
                logger.debug("Got HTML source, content length = " +
                             str(len(res2)))
            except:
                logger.exception("Failed to get HTML source from " + og_link)
                traceback.print_exc()
                return links_list2

            logger.debug("Extracting links from the HTML")
            soup2 = BeautifulSoup(
                res2, 'html.parser')  # turning request into soup object

            try:
                # writing html content to a txt file
                file_name2 = ''.join(
                    random.choices(string.ascii_uppercase + string.digits,
                                   k=16))
                file_name2 = file_name2 + '.txt'
                file_path2 = os.path.join(os.getcwd(), config['file_dir'],
                                          file_name2)
                text_file2 = open(file_path2, "w")
                n2 = text_file2.write(str(soup2))
                text_file2.close()
            except:
                logger.exception("Cannot write link in a file.")

            if 'Content-Length' in r2.headers:
                updated_doc = {
                    "is_crawled": True,
                    "last_crawl_date": datetime.datetime.utcnow(),
                    "response_status": r2.status_code,
                    "content_type": r2.headers['Content-Type'],
                    "con_length": r2.headers['Content-Length'],
                    "file_path": file_path2,
                }

            else:

                updated_doc = {
                    "is_crawled": True,
                    "last_crawl_date": datetime.datetime.utcnow(),
                    "response_status": r2.status_code,
                    "content_type": r2.headers['Content-Type'],
                    "con_length": len(r2.content),
                    "file_path": file_path2,
                }

            col.update_one(doc,
                           {"$set": updated_doc
                            })  # updating the recently crawled link document

            links2 = soup2.find_all("a")  # finding all anchor tags
            for link2 in links2:  # iterating through a tags
                temp2 = link2.get('href')  # getting the link from a tag
                if temp2 not in links_list2:  # if link wasn't found in this cycle
                    links_list2.append(temp2)
                    # checking for validity of link
                    temp_parse3 = urllib.parse.urlparse(temp2)
                    netloc_bool3 = bool(temp_parse3.netloc)
                    scheme_bool3 = bool(temp_parse3.scheme)
                    if netloc_bool3:
                        if scheme_bool3:
                            # link is absolute url and valid
                            actual_link3 = temp2
                            query3 = {"link": actual_link3}
                            if col.count_documents(query3) == 0:
                                temp_doc = {
                                    "link": actual_link3,
                                    "source_link": og_link,
                                    "is_crawled": False,
                                    "last_crawl_date": None,
                                    "response_status": None,
                                    "content_type": None,
                                    "con_length": None,
                                    "file_path": None,
                                    "created_at": datetime.datetime.utcnow()
                                }
                                col.insert_one(
                                    temp_doc)  # adding link to the collection
                            else:
                                print(
                                    temp2 + " already exists."
                                )  # if link already exists in the collection
                        else:
                            print(temp2 +
                                  " link not valid")  # link is not valid
                    else:
                        # link is a relative link
                        actual_link4 = urllib.parse.urljoin(og_link, temp2)
                        temp_parse4 = urllib.parse.urlparse(actual_link4)
                        netloc_bool4 = bool(temp_parse4.netloc)
                        scheme_bool4 = bool(temp_parse4.scheme)
                        if netloc_bool4 and scheme_bool4:
                            # link is relative and valid
                            query4 = {"link": actual_link4}
                            if col.count_documents(query4) == 0:
                                temp_doc = {
                                    "link": actual_link4,
                                    "source_link": og_link,
                                    "is_crawled": False,
                                    "last_crawl_date": None,
                                    "response_status": None,
                                    "content_type": None,
                                    "con_length": None,
                                    "file_path": None,
                                    "created_at": datetime.datetime.utcnow()
                                }
                                col.insert_one(
                                    temp_doc
                                )  # adding link document to collection
                            else:
                                print(actual_link4 + " already exists."
                                      )  # link already exists in collection
                        else:
                            print(actual_link4 +
                                  " not valid")  # link is not valid
            return links2  # return list of links found
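A minimal sketch of how this crawler might be driven, assuming the same module-level `config` dict and `logger` the function relies on; the driver loop and the `crawl_interval` key are illustrative additions, not part of the original:

import time

if __name__ == "__main__":
    # Each call either seeds the collection from config['host_name']
    # or picks one stored, uncrawled link and processes it.
    while True:
        found = crawler()
        logger.debug("Cycle finished, %d links returned" % len(found))
        time.sleep(config.get("crawl_interval", 5))  # hypothetical config key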
Example #2
'''

1. Install MongoDB Enterprise Server 4.2.5
   https://www.mongodb.com/download-center/enterprise

   MongoDB Compass installed as well
   
2. pip3 install pymongo

3. Add MongoDB server to PATH
   Control Panel\System and Security\System
   Advanced system settings
   Environment Variables
   User variables: Path
   Add C:\Program Files\MongoDB\Server\4.2\bin

4. MongoDB Server should be running

5. Enter MongoDB with mongo command

'''

import pymongo
from pprint import pprint

client = pymongo.MongoClient('mongodb://127.0.0.1:27017')

with client:
    pprint(dir(client), indent=4)
    print(client.server_info())
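A short follow-up sketch, under the same local defaults, for verifying that the server from step 4 actually responds before doing any work; it uses the standard `ping` admin command and a short server-selection timeout:

import pymongo
from pymongo.errors import ConnectionFailure

client = pymongo.MongoClient('mongodb://127.0.0.1:27017',
                             serverSelectionTimeoutMS=3000)
try:
    # 'ping' is a cheap no-op command; it fails fast if mongod is not running
    client.admin.command('ping')
    print('MongoDB is up, version', client.server_info()['version'])
except ConnectionFailure as exc:
    print('MongoDB is not reachable:', exc)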
Example #3
def config(app):
    """Configure the application"""

    # Sentry (logging)
    if app.config.get('SENTRY_DSN'):

        sentry_logging = LoggingIntegration(
            level=logging.INFO,
            event_level=logging.WARNING
        )

        app.sentry = sentry_sdk.init(
            app.config.get('SENTRY_DSN'),
            integrations=[
                sentry_logging,
                FlaskIntegration(),
                RedisIntegration()
            ]
        )

    # Database (mongo and mongoframes)
    app.mongo = pymongo.MongoClient(app.config['MONGO_URI'])
    app.db = app.mongo.get_default_database()
    mongoframes.Frame._client = app.mongo

    # Database authentication
    if app.config.get('MONGO_PASSWORD'):
        app.db.authenticate(
            app.config.get('MONGO_USERNAME'),
            app.config.get('MONGO_PASSWORD')
            )

    # Database (redis)
    if app.config['REDIS_USE_SENTINEL']:
        sentinel = Sentinel(
            app.config['REDIS_ADDRESS'],
            db=app.config['REDIS_DB'],
            password=app.config['REDIS_PASSWORD'],
            decode_responses=True
        )
        app.redis = sentinel.master_for(app.config['REDIS_SENTINEL_MASTER'])

    else:
        app.redis = StrictRedis(
            host=app.config['REDIS_ADDRESS'][0],
            port=app.config['REDIS_ADDRESS'][1],
            db=app.config['REDIS_DB'],
            password=app.config['REDIS_PASSWORD'],
            decode_responses=True
        )

    # CSRF protection
    forms.CSRF.init_app(app)

    # Manage
    app.manage = manage.Manage(app)

    # Email
    if 'EMAIL_BACKEND' in app.config:
        app.mailer = app.config['EMAIL_BACKEND'].Mailer(
            **app.config.get('EMAIL_BACKEND_SETTINGS')
            )

    # Set the application's default date format for form fields
    forms.fields.DateField.default_format = app.config.get('DATE_FORMAT')

    # Fixes

    # Increase the default cache size for jinja templates
    app.jinja_env.cache = create_cache(1000)

    # REMOTE_ADDR when running behind a proxy server
    app.wsgi_app = ProxyFix(app.wsgi_app)
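A hedged sketch of how a helper like `config(app)` is typically called from a Flask application factory; `create_app` and the settings object path are assumptions for illustration, not part of the source:

from flask import Flask

def create_app(settings='settings.Production'):
    app = Flask(__name__)
    # The settings object is assumed to define MONGO_URI, REDIS_*, etc.
    app.config.from_object(settings)
    config(app)  # attach mongo, redis, mailer and other services as above
    return app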
Example #4
"""

import pymongo
from MaterialPlanning import MaterialPlanning
import time
from dateutil import parser
from utils import required_dctCN, owned_dct, aggregation, collectionCN

CCSeason = 3

aggregation(collectionCN, required_dctCN, "阿米娅")
update_time = parser.parse(time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime()))
print(update_time)
print('正在从企鹅物流获取数据...')  # "Fetching data from Penguin Statistics..."
server = open('data/server.txt', 'r').readline().strip()
dbclient = pymongo.MongoClient(server)
db = dbclient['Arknights_OneGraph']

Filter_special_items = ['荒芜行动物资补给', '罗德岛物资补给', '岁过华灯', '32h战略配给', '感谢庆典物资补给']
Filter_special_stages = ['S4-4', 'S6-4', 'S4-9']

# Calculation for CN server
collection = db['Material_Event']
Event_Stages = ['FA-%d' % x for x in range(1, 9)]
mp_event = MaterialPlanning(filter_stages=Filter_special_stages +
                            Filter_special_items,
                            filter_freq=100,
                            update=False,
                            printSetting='000011101111',
                            CCSeason=CCSeason)
mp_event.get_plan(required_dctCN,
Example #5
 def __init__(self):
     self.client = pymongo.MongoClient(DB_CONFIG['DB_CONNECT_STRING'],
                                       connect=False)
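Here `connect=False` tells MongoClient not to open a connection in the constructor; the connection is only established on first use, which is why the flag is common when the client is created at import time or before worker processes start. A self-contained sketch, with the connection string and database name assumed:

import pymongo

DB_CONFIG = {'DB_CONNECT_STRING': 'mongodb://127.0.0.1:27017'}  # assumed shape

class MongoStore:
    def __init__(self):
        # Lazy client: no network I/O happens here, only on the first operation
        self.client = pymongo.MongoClient(DB_CONFIG['DB_CONNECT_STRING'],
                                          connect=False)
        self.db = self.client['app_db']  # hypothetical database name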
Example #6
 def get_proddata(self,url):
     config=self.config
     pgid =1
     client = pymongo.MongoClient(config["mongolink"])
     db = client['pharmascrape']
     nins=0
     self.logger.info("Mega-category:"+config['Mega-category'])
     self.logger.info("Category:"+config['Category'])
     self.logger.info("segment:"+config['segment'])
     self.logger.info("Sub-segment:"+config['Sub-segment'])
     run = True
     while (run):
         soup = self.get_soup(url+"/"+str(pgid))
         prods=soup.find_all('div', {"class":"product mb-5"})
         self.logger.info("#Found products:" + str(len(prods)))
         for prod in prods:
             try:
                 proddict=dict()
                 proddict['Source']=config['site']
                 proddict['Mega-category']=config['Mega-category']
                 proddict['Category']=config['Category']
                 proddict['segment']=config['segment']
                 proddict['Sub-segment']=config['Sub-segment']
                 proddict['template']=config['template']
                 try:
                     proddict['urltoproduct']=config['site']+prod.find("h2",{"class":"product-title"}).find("a")['href']
                 except Exception as e:
                     self.logger.error("Line 94:"+str(e))
                     proddict['urltoproduct']="None"
                 try:
                     proddict['Product_name']=prod.find("h2",{"class":"product-title"}).find("a")['title']
                     if (db['scrapes'].find({"Source":config['site'],"Product_name":proddict['Product_name']}).count()>0):
                         continue
                 except Exception as e:
                     self.logger.error("Line 99:"+str(e))
                     proddict['Product_name']="None"
                 try:
                     proddict['Price'] = float(prod.find("span",{"class":"price"}).text.replace("\n","").replace("TTC","").replace("\xa0€","").replace(",",".").strip())
                 except Exception as e:
                     self.logger.error("Line 133:"+str(e))
                     proddict['Price'] = "None"
                 try:
                     proddict["Brand"] = prod.find("h3",{"class":"product-subtitle"}).text.strip()
                 except Exception as e: 
                     self.logger.error("Line 106:"+str(e))
                     proddict["Brand"] = "None"
                 try:
                     proddict['Crossed_out_Price'] = float(prod.find("s",{"class":"text-promo"}).text.replace("\n","").replace("TTC","").replace("\xa0€","").replace(",",".").strip())
                 except Exception as e:
                     self.logger.error("Line 133:"+str(e))
                     proddict['Crossed_out_Price'] = "None" 
                 try:
                     proddict["Promotional_claim"] = prod.find("p",{"class":"bg-promo p-2 mb-2 text-center text-promo text-uppercase"}).text.strip()
                 except Exception as e: 
                     self.logger.error("Line 146:"+str(e))
                     proddict["Promotional_claim"] = "None"
                 try:
                     proddict['Imagelink'] = prod.find("img")['src']
                     proddict['Imagefilename'] = proddict['Imagelink'].split("/")[len(proddict['Imagelink'].split("/"))-1]
                 except Exception as e:
                     self.logger.error("Line 138:"+str(e))
                     proddict["Imagelink"]="None"
                     proddict["Imagefilename"]="None"
                 db['scrapes'].insert_one(proddict)
                 nins=nins+1
                 self.logger.info("#insertions:" + str(nins))
             except Exception as e:
                 self.logger.info("soup:" + str(prod))
                 self.logger.error("Line 87:" + str(e))
                 continue
         run = self.is_product(url+"/"+str(pgid))
         pgid =pgid+1
     client.close()
     pass
Example #7
 def __init__(self):
     self.myclient = pymongo.MongoClient("mongodb://localhost:27017/")
     self.mydb = self.myclient["proxies"]
     self.mycol = self.mydb["proxy"]
Example #8
import csv
import time
import pandas as pd
import pymongo
import configparser

config = configparser.ConfigParser()
config.read('config.ini')

try:
    client = pymongo.MongoClient(config["DEFAULT"]["MONGO_URL"], ssl_ca_certs='./cert.pem', connectTimeoutMS=30000, socketTimeoutMS=None)
    db = client.main
    collection = db.airdrop
    client.server_info()
except pymongo.errors.ServerSelectionTimeoutError as err:
    print(err)

allocated = 0
supply = 3

# Change allocation to zero for all items so they can be reset
post_id = collection.update({"supply": supply}, {"$set": {"allocation": 0}}, upsert=True, multi=True)
print('Updated '+str(post_id))

with open('../data/advisors.csv', 'rb') as csvfile:
    addresses = pd.read_csv(csvfile)
    for index, row in addresses.iterrows():
        # Check that address is valid
        try:
            allocation = int(row.allocation)
        except:
Example #9
# Flask Setup
app = Flask(__name__)

# Database Setup
# The database URI
app.config['SQLALCHEMY_DATABASE_URI'] = "sqlite:///db/aqi.sqlite"
db = SQLAlchemy(app)

# Create connection variable
# conn = 'mongodb://*****:*****@ds133202.mlab.com:33202/trafficaq'
conn = 'mongodb://*****:*****@ds233452.mlab.com:33452/trafficaq'

# Pass connection to the pymongo instance.
client = pymongo.MongoClient(conn)

# Connect to a database. Will create one if not already available.
# db = client.traffic_db
mdb = client.trafficaq

# Drops collection if available to remove duplicates
mdb.trafficAQ.drop()


class AQI(db.Model):
    __tablename__ = 'aqi'

    id = db.Column(db.Integer, primary_key=True)
    Latitude = db.Column(db.String)
    Longitude = db.Column(db.String)
Example #10
 def open_spider(self, spider):
     self.client = pymongo.MongoClient(self.mongo_uri)
     self.db = self.client[self.mongo_db]
     self.db.authenticate(self.mongo_user,self.mongo_pass)
Example #11
 def initializing_main_db():
     client = pymongo.MongoClient(Database.uri)
     Database.db = client['blog']
Example #12
 def __init__(self, mongo_uri, mongo_db):
     self.client = pymongo.MongoClient(mongo_uri)
     self.db = self.client[mongo_db]
     self.db.user.remove()
Example #13
import json
import time
from datetime import datetime
from urllib.parse import urlencode

import pymongo
import scrapy
from pandas import DataFrame

connection = pymongo.MongoClient('192.168.2.149', 27017)
db = connection["chaboshi"]
collection = db["chaboshi_car"]
model_data = collection.find({}, {"vehicle_id": 1, "maxRegYear": 1, "minRegYear": 1, "_id": 0})

car_msg_list = list(model_data)
car_msg_df = DataFrame(car_msg_list)
car_msg_df_new = car_msg_df.drop_duplicates('vehicle_id')


class ChaboshiGzSpider(scrapy.Spider):
    name = 'chaboshi_gz'
    allowed_domains = ['chaboshi.cn']

    # start_urls = ['http://chaboshi.cn/']

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(
            getattr(cls, 'custom_debug_settings' if getattr(cls, 'is_debug', False) else 'custom_settings', None) or {},
            priority='spider')
Example #14
            print("badItems ",self.badItems) 
        
def fetch(apps):
    for app in apps:
        queue = threadQueue(numOfThreads,app)
        queue.waitForStop()
        print(app["id"],"finished")
    badItems = queue.badItems
    
    # re-crawl pages that failed
    for app in badItems:
        queue = threadQueue(numOfThreads,app)
        queue.waitForStop()
    return queue.badItems

# MongoDB connection
client = pymongo.MongoClient('mongodb://*****:*****@***.***.***.***:27017/steam_db')
db = client.steam_db
regions = db.China
collection = regions.reviews

requests.packages.urllib3.disable_warnings()
session = requests.session()

appInfos = getAllApps()
# print(appInfos)
numOfThreads = 1
badPages = fetch(appInfos)
print("all finished")

# http://store.steampowered.com/appreviews/243470?json=1&filter=all&language=all&day_range=360&cursor=*&review_type=all&purchase_type=all&num_per_page=10
Example #15
    def run(self):
        app = Flask(__name__)
        csrf = CSRFProtect()
        SECRET_KEY = os.urandom(32)
        app.config['SECRET_KEY'] = SECRET_KEY

        csrf.init_app(app)

        c = pymongo.MongoClient()
        mc = pymongo.MongoClient(os.environ['FEDN_MONGO_HOST'], int(os.environ['FEDN_MONGO_PORT']), username=os.environ['FEDN_MONGO_USER'],
                                 password=os.environ['FEDN_MONGO_PASSWORD'])
        mdb = mc[os.environ['ALLIANCE_UID']]
        alliance = mdb["status"]

        @app.route('/')
        def index():
            # logs_fancy = str()
            # for log in self.logs:
            #    logs_fancy += "<p>" + log + "</p>\n"
            client = self.name
            state = ReducerStateToString(self.control.state())
            logs = None
            refresh = True
            return render_template('index.html', client=client, state=state, logs=logs, refresh=refresh,
                                   dashboardhost=os.environ["FEDN_DASHBOARD_HOST"],
                                   dashboardport=os.environ["FEDN_DASHBOARD_PORT"])

        # http://localhost:8090/add?name=combiner&address=combiner&port=12080&token=e9a3cb4c5eaff546eec33ff68a7fbe232b68a192
        @app.route('/add')
        def add():
            # TODO check for get variables
            name = request.args.get('name', None)
            address = request.args.get('address', None)
            port = request.args.get('port', None)
            # token = request.args.get('token')
            # TODO do validation

            if port is None or address is None or name is None:
                return "Please specify correct parameters."

            certificate, key = self.certificate_manager.get_or_create(address).get_keypair_raw()
            import base64
            cert_b64 = base64.b64encode(certificate)
            key_b64 = base64.b64encode(key)

            # TODO append and redirect to index.
            import copy
            combiner = CombinerInterface(self, name, address, port, copy.deepcopy(certificate), copy.deepcopy(key))
            self.control.add(combiner)

            ret = {'status': 'added', 'certificate': str(cert_b64).split('\'')[1],
                   'key': str(key_b64).split('\'')[1]}  # TODO remove ugly string hack
            return jsonify(ret)

        @app.route('/seed', methods=['GET', 'POST'])
        def seed():
            if request.method == 'POST':
                # upload seed file
                uploaded_seed = request.files['seed']
                if uploaded_seed:
                    self.control.commit(uploaded_seed.filename, uploaded_seed)
            else:
                h_latest_model_id = self.control.get_latest_model()
                model_info = self.control.get_model_info()
                return render_template('index.html', h_latest_model_id=h_latest_model_id, seed=True, model_info=model_info)

            seed = True
            return redirect(url_for('seed', seed=seed))

        # http://localhost:8090/start?rounds=4&model_id=879fa112-c861-4cb1-a25d-775153e5b548
        @app.route('/start', methods=['GET', 'POST'])
        def start():

            if request.method == 'POST':
                timeout = request.form.get('timeout', 180)
                rounds = int(request.form.get('rounds', 1))

                task = (request.form.get('task', ''))
                active_clients = request.form.get('active_clients', 2)
                clients_required = request.form.get('clients_required', 2)
                clients_requested = request.form.get('clients_requested', 8)

                latest_model_id = self.control.get_latest_model()
                config = {'round_timeout': timeout, 'model_id': latest_model_id,
                          'rounds': rounds, 'active_clients': active_clients, 'clients_required': clients_required,
                          'clients_requested': clients_requested, 'task': task}

                self.control.instruct(config)
                return redirect(url_for('index', message="Sent execution plan."))

            else:
                # Select rounds UI
                rounds = range(1, 100)
                latest_model_id = self.control.get_latest_model()
                return render_template('index.html', round_options=rounds, latest_model_id=latest_model_id)

            client = self.name
            state = ReducerStateToString(self.control.state())
            logs = None
            refresh = False
            return render_template('index.html', client=client, state=state, logs=logs, refresh=refresh)

        @app.route('/assign')
        def assign():
            name = request.args.get('name', None)
            combiner_preferred = request.args.get('combiner', None)
            import uuid
            id = str(uuid.uuid4())

            if combiner_preferred:
                combiner = self.control.find(combiner_preferred)
            else:
                combiner = self.control.find_available_combiner()

            if combiner:
                # certificate, _ = self.certificate_manager.get_or_create(combiner.name).get_keypair_raw()
                import base64
                cert_b64 = base64.b64encode(combiner.certificate)
                response = {'host': combiner.address, 'port': combiner.port,
                            'certificate': str(cert_b64).split('\'')[1]}

                return jsonify(response)
            elif combiner is None:
                abort(404, description="Resource not found")
            # 1.receive client parameters
            # 2. check with available combiners if any clients are needed
            # 3. let client know where to connect.
            return

        @app.route('/infer')
        def infer():
            result = ""
            try:
                self.control.set_model_id()
            except fedn.exceptions.ModelError:
                print("Failed to seed control.")

            return result

        # plot metrics from DB
        def _scalar_metrics(metrics):
            """ Extract all scalar valued metrics from a MODEL_VALIDATION. """

            data = json.loads(metrics['data'])
            data = json.loads(data['data'])

            valid_metrics = []
            for metric, val in data.items():
                # If it can be converted to a float it is a valid, scalar metric
                try:
                    val = float(val)
                    valid_metrics.append(metric)
                except:
                    pass

            return valid_metrics

        @app.route('/plot')
        def plot():
            box = 'box'
            plot = create_plot(box)
            show_plot = True
            return render_template('index.html', show_plot=show_plot, plot=plot)

        def create_plot(feature):
            if feature == 'table':
                return create_table_plot()
            elif feature == 'timeline':
                return create_timeline_plot()
            elif feature == 'ml':
                return create_ml_plot()
            elif feature == 'box':
                return create_box_plot()
            else:
                return 'No plot!'

        @app.route('/plot_type', methods=['GET', 'POST'])
        def change_features():
            feature = request.args['selected']
            graphJSON = create_plot(feature)
            return graphJSON

        def create_table_plot():
            metrics = alliance.find_one({'type': 'MODEL_VALIDATION'})
            if metrics is None:
                fig = go.Figure(data=[])
                fig.update_layout(title_text='No data currently available for mean metrics')
                table = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
                return table

            valid_metrics = _scalar_metrics(metrics)
            if valid_metrics == []:
                fig = go.Figure(data=[])
                fig.update_layout(title_text='No scalar metrics found')
                table = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
                return table

            all_vals = []
            models = []
            for metric in valid_metrics:
                validations = {}
                for post in alliance.find({'type': 'MODEL_VALIDATION'}):
                    e = json.loads(post['data'])
                    try:
                        validations[e['modelId']].append(float(json.loads(e['data'])[metric]))
                    except KeyError:
                        validations[e['modelId']] = [float(json.loads(e['data'])[metric])]

                vals = []
                models = []
                for model, data in validations.items():
                    vals.append(numpy.mean(data))
                    models.append(model)
                all_vals.append(vals)

            header_vals = valid_metrics
            models.reverse()
            values = [models]
            print(all_vals, flush=True)
            for vals in all_vals:
                vals.reverse()
                values.append(vals)

            fig = go.Figure(data=[go.Table(
                header=dict(values=['Model ID'] + header_vals,
                            line_color='darkslategray',
                            fill_color='lightskyblue',
                            align='left'),

                cells=dict(values=values,  # 2nd column
                           line_color='darkslategray',
                           fill_color='lightcyan',
                           align='left'))
            ])

            fig.update_layout(title_text='Summary: mean metrics')
            table = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
            return table

        def create_timeline_plot():
            trace_data = []
            x = []
            y = []
            base = []
            for p in alliance.find({'type': 'MODEL_UPDATE_REQUEST'}):
                e = json.loads(p['data'])
                cid = e['correlationId']
                for cc in alliance.find({'sender': p['sender'], 'type': 'MODEL_UPDATE'}):
                    da = json.loads(cc['data'])
                    if da['correlationId'] == cid:
                        cp = cc

                cd = json.loads(cp['data'])
                tr = datetime.strptime(e['timestamp'], '%Y-%m-%d %H:%M:%S.%f')
                tu = datetime.strptime(cd['timestamp'], '%Y-%m-%d %H:%M:%S.%f')
                ts = tu - tr
                base.append(tr.timestamp())
                x.append(ts.total_seconds())
                y.append(p['sender']['name'])

            trace_data.append(go.Bar(
                x=x,
                y=y,
                orientation='h',
                base=base,
                marker=dict(color='royalblue'),
                name="Training",
            ))

            x = []
            y = []
            base = []
            for p in alliance.find({'type': 'MODEL_VALIDATION_REQUEST'}):
                e = json.loads(p['data'])
                cid = e['correlationId']
                for cc in alliance.find({'sender': p['sender'], 'type': 'MODEL_VALIDATION'}):
                    da = json.loads(cc['data'])
                    if da['correlationId'] == cid:
                        cp = cc
                cd = json.loads(cp['data'])
                tr = datetime.strptime(e['timestamp'], '%Y-%m-%d %H:%M:%S.%f')
                tu = datetime.strptime(cd['timestamp'], '%Y-%m-%d %H:%M:%S.%f')
                ts = tu - tr
                base.append(tr.timestamp())
                x.append(ts.total_seconds())
                y.append(p['sender']['name'])

            trace_data.append(go.Bar(
                x=x,
                y=y,
                orientation='h',
                base=base,
                marker=dict(color='lightskyblue'),
                name="Validation",
            ))

            layout = go.Layout(
                barmode='stack',
                showlegend=True,
            )

            fig = go.Figure(data=trace_data, layout=layout)
            fig.update_xaxes(title_text='Timestamp')
            fig.update_layout(title_text='Alliance timeline')

            # tab = go.Figure(data=[go.Table(
            #     header=dict(values=['Model updates', 'Model Validations'],
            #                 line_color='darkslategray',
            #                 fill_color='lightskyblue',
            #                 align='left'),
            #     cells=dict(values=[[100, 90, 80, 90],  # 1st column
            #                        [95, 85, 75, 95]],  # 2nd column
            #                line_color='darkslategray',
            #                fill_color='lightcyan',
            #                align='left'))
            # ])
            #
            # tab.update_layout(width=500, height=300)
            # tab.update_layout(title_text='Summary')
            timeline = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
            return timeline

        def create_ml_plot():
            metrics = alliance.find_one({'type': 'MODEL_VALIDATION'})
            if metrics is None:
                fig = go.Figure(data=[])
                fig.update_layout(title_text='No data currently available for Mean Absolute Error')
                ml = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
                return ml

            data = json.loads(metrics['data'])
            data = json.loads(data['data'])
            valid_metrics = []
            for metric, val in data.items():
                # Check if scalar - is this robust ?
                if isinstance(val, float):
                    valid_metrics.append(metric)

            # Assemble a dict with all validations
            validations = {}
            clients = {}

            for post in alliance.find({'type': 'MODEL_VALIDATION'}):
                try:
                    e = json.loads(post['data'])
                    clients[post['sender']['name']].append(json.loads(e['data'])[metric])
                except KeyError:
                    clients[post['sender']['name']] = []

            rounds = []
            traces_data = []

            for c in clients:
                print(clients[c], flush=True)
                traces_data.append(go.Scatter(
                    x=rounds,
                    y=clients[c],
                    name=c
                ))
            fig = go.Figure(traces_data)
            fig.update_xaxes(title_text='Rounds')
            fig.update_yaxes(title_text='MAE', tickvals=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
            fig.update_layout(title_text='Mean Absolute Error Plot')
            ml = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
            return ml

        def create_box_plot():
            metrics = alliance.find_one({'type': 'MODEL_VALIDATION'})
            if metrics is None:
                fig = go.Figure(data=[])
                fig.update_layout(title_text='No data currently available for metric distribution over alliance '
                                             'participants')
                box = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
                return box

            valid_metrics = _scalar_metrics(metrics)
            if valid_metrics == []:
                fig = go.Figure(data=[])
                fig.update_layout(title_text='No scalar metrics found')
                box = json.dumps(fig, cls=plotly.utils.PlotlyJSONEncoder)
                return box

            # Just grab the first metric in the list.
            # TODO: Let the user choose, or plot all of them.
            if "accuracy" in valid_metrics:
                metric = "accuracy"
            else:
                metric = valid_metrics[0]
            validations = {}
            for post in alliance.find({'type': 'MODEL_VALIDATION'}):
                e = json.loads(post['data'])
                try:
                    validations[e['modelId']].append(float(json.loads(e['data'])[metric]))
                except KeyError:
                    validations[e['modelId']] = [float(json.loads(e['data'])[metric])]

            box = go.Figure()

            x = []
            y = []
            box_trace = []
            for model_id, acc in validations.items():
                x.append(model_id)
                y.append(numpy.mean([float(i) for i in acc]))
                if len(acc) >= 2:
                    box.add_trace(go.Box(y=acc, name=str(model_id), marker_color="royalblue", showlegend=False))

            rounds = list(range(len(y)))
            box.add_trace(go.Scatter(
                x=x,
                y=y,
                name='Mean'
            ))

            box.update_xaxes(title_text='Model ID')
            box.update_yaxes(tickvals=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
            box.update_layout(title_text='Metric distribution over alliance participants: {}'.format(metric))
            box = json.dumps(box, cls=plotly.utils.PlotlyJSONEncoder)
            return box


        # @app.route('/seed')
        # def seed():
        #    try:
        #        result = self.inference.infer(request.args)
        #    except fedn.exceptions.ModelError:
        #        print("no model")
        #
        #    return result

        # import os, sys
        # self._original_stdout = sys.stdout
        # sys.stdout = open(os.devnull, 'w')
        if self.certificate:
            print("trying to connect with certs {} and key {}".format(str(self.certificate.cert_path),
                                                                      str(self.certificate.key_path)), flush=True)
            app.run(host="0.0.0.0", port="8090",
                    ssl_context=(str(self.certificate.cert_path), str(self.certificate.key_path)))
Example #16
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
 @Time : 2019/10/28 14:28
 @Auth : 明明
 @IDE  : PyCharm
 """
# -*- coding: utf-8 -*
import json
import xlwt
import os
import sys
import pymongo
from setting import MONGO_HOST, MONGO_PORT, REDIS_HOST, REDIS_PORT

client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)


# def readjson():
#     db = client["OTHERS"]
#     collection = db["zhihu_answer"]
#     info_li = []
#     info = collection.find({})
#     j = 0
#     for i in info:
#         j += 1
#         if j > 2000:
#             break
#         info_li.append(i)
#     return info_li
Example #17
 def conn(self):
     client = pymongo.MongoClient(host=Config().MONGODB['host'],
                                  port=Config().MONGODB['port'])
     self.tdb = client[Config().MONGODB['dbName']]
Example #18
def get_coll():
    client = pymongo.MongoClient('127.0.0.1', 27017)
    db = client.nnn
    user = db.user_colletion
    return user
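A brief usage sketch for the helper above; the sample document is purely illustrative:

users = get_coll()
users.insert_one({'name': 'alice', 'age': 30})   # hypothetical document
print(users.find_one({'name': 'alice'}))
print(users.count_documents({}))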
Example #19
import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
wait = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)


##
def search():
    try:
        browser.get('https://www.taobao.com/')
        input = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#q')))
        submit = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#J_TSearchForm > div.search-button > button')))
        input.send_keys('电脑')
        submit.click()
        total = wait.until(
Example #20
 def open_spider(self, spider):
     self.client = pymongo.MongoClient(self.mongo_uri)
     self.db = self.client[self.mongo_db]
Example #21
import logging
import random
import pickle
import os
import pymongo
from telegram.ext import ConversationHandler
from telegram import ReplyKeyboardMarkup, ReplyKeyboardRemove, ChatAction
from quizbot.quiz.question_factory import QuestionBool, QuestionChoice, QuestionChoiceSingle, \
    QuestionNumber, QuestionString
from quizbot.quiz.attempt import Attempt

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

db = pymongo.MongoClient(os.environ.get('MONGODB')).quizzes
# Dict to store user data like an attempt instance
userDict = dict()


def start(update, _):
    """
    Starts a conversation about an attempt at a quiz.
    Welcomes the user and asks for a quiz.
    """
    logger.info('[%s] Attempt initialized', update.message.from_user.username)

    if update.message.from_user.id in userDict:
        # user is in the middle of a quiz and can't attempt a second one
        logger.info('[%s] Attempt canceled because the user is in the middle of a quiz.',
                    update.message.from_user.username)
Example #22
 def __init__(self):
     self.client = pymongo.MongoClient(host=mongo_host, connect=False)
     self.db = self.client["zimuzu"]
Example #23
import pymongo
import os
from flask_httpauth import HTTPBasicAuth


def encode_mongo_obj(obj):
    """Convert a Mongo object into a dict for Flask-RESTful"""
    obj['_id'] = str(obj['_id'])
    return obj


# Database stuff is here because we don't wanna connect multiple times
uri = os.environ.get('MONGOLAB_URI')
print("connecting to URI: {}".format(uri))
client = pymongo.MongoClient(uri)
db = client.get_default_database()

# Auth stuff is global
auth = HTTPBasicAuth()
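A hedged sketch of how the global `auth` and `db` objects might be combined in a route; the `users` collection, plain-text password check, and route are assumptions for illustration (a real app would verify a hashed password):

from flask import Flask

app = Flask(__name__)

@auth.verify_password
def verify_password(username, password):
    user = db.users.find_one({'username': username})  # hypothetical collection
    return user is not None and user.get('password') == password

@app.route('/private')
@auth.login_required
def private():
    return "Hello, {}!".format(auth.username())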
Example #24
#T Fabiha
#SoftDev2 pd7
#K06 -- Yummy Mongo Py
#2019-03-01

import pymongo

SERVER_ADDR = "159.65.231.92"
connection = pymongo.MongoClient(SERVER_ADDR)
db = connection.test
collection = db.restaurants

def in_borough( borough ):
    obj = collection.find({"borough" : borough})

    for i in obj:
        print(i)

def in_zip( zipcode ):
    obj = collection.find({"address.zipcode" : zipcode})

    for i in obj:
        print(i)

def in_zip_w_grade( zipcode, grade ):
    obj = collection.find({"$and" : [{"address.zipcode" : zipcode}, {"grades.grade" : grade}]})

    for i in obj:
        print(i)

def in_zip_below( zipcode, score ):
Example #25
 def __init__(self):
     client = pymongo.MongoClient("mongo", 27017)
     self.account_collection = client["accounts"]['account']
Example #26
def check_mongodb(host, port, user, passwd, server_id, tags):
    try:
        func.mysql_exec(
            "insert into mongodb_status_history SELECT *,LEFT(REPLACE(REPLACE(REPLACE(create_time,'-',''),' ',''),':',''),12) from mongodb_status where server_id='%s';"
            % (server_id), '')
        func.mysql_exec(
            "delete from mongodb_status where server_id='%s';" % (server_id),
            '')

        #connect = pymongo.Connection(host,int(port))
        client = pymongo.MongoClient(host, int(port))
        db = client['admin']
        db.authenticate(user, passwd)
        serverStatus = client.admin.command(
            bson.son.SON([('serverStatus', 1), ('repl', 2)]))
        time.sleep(1)
        serverStatus_2 = client.admin.command(
            bson.son.SON([('serverStatus', 1), ('repl', 2)]))
        connect = 1
        ok = int(serverStatus['ok'])
        version = serverStatus['version']
        uptime = serverStatus['uptime']
        connections_current = serverStatus['connections']['current']
        connections_available = serverStatus['connections']['available']
        globalLock_activeClients = serverStatus['globalLock']['activeClients'][
            'total']
        globalLock_currentQueue = serverStatus['globalLock']['currentQueue'][
            'total']
        mem_bits = serverStatus['mem']['bits']
        mem_resident = serverStatus['mem']['resident']
        mem_virtual = serverStatus['mem']['virtual']
        mem_supported = serverStatus['mem']['supported']
        mem_mapped = serverStatus['mem']['mapped']
        mem_mappedWithJournal = serverStatus['mem']['mappedWithJournal']
        network_bytesIn_persecond = int(
            serverStatus_2['network']['bytesIn']) - int(
                serverStatus['network']['bytesIn'])
        network_bytesOut_persecond = int(
            serverStatus_2['network']['bytesOut']) - int(
                serverStatus['network']['bytesOut'])
        network_numRequests_persecond = int(
            serverStatus_2['network']['numRequests']) - int(
                serverStatus['network']['numRequests'])
        opcounters_insert_persecond = int(
            serverStatus_2['opcounters']['insert']) - int(
                serverStatus['opcounters']['insert'])
        opcounters_query_persecond = int(
            serverStatus_2['opcounters']['query']) - int(
                serverStatus['opcounters']['query'])
        opcounters_update_persecond = int(
            serverStatus_2['opcounters']['update']) - int(
                serverStatus['opcounters']['update'])
        opcounters_delete_persecond = int(
            serverStatus_2['opcounters']['delete']) - int(
                serverStatus['opcounters']['delete'])
        opcounters_command_persecond = int(
            serverStatus_2['opcounters']['command']) - int(
                serverStatus['opcounters']['command'])

        #replset
        try:
            repl = serverStatus['repl']
            setName = repl['setName']
            replset = 1
            if repl['secondary'] == True:
                repl_role = 'secondary'
                repl_role_new = 's'
            else:
                repl_role = 'master'
                repl_role_new = 'm'
        except:
            replset = 0
            repl_role = 'master'
            repl_role_new = 'm'
            pass

        ##################### insert data to mysql server#############################
        sql = "insert into mongodb_status(server_id,host,port,tags,connect,replset,repl_role,ok,uptime,version,connections_current,connections_available,globalLock_currentQueue,globalLock_activeClients,mem_bits,mem_resident,mem_virtual,mem_supported,mem_mapped,mem_mappedWithJournal,network_bytesIn_persecond,network_bytesOut_persecond,network_numRequests_persecond,opcounters_insert_persecond,opcounters_query_persecond,opcounters_update_persecond,opcounters_delete_persecond,opcounters_command_persecond) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
        param = (server_id, host, port, tags, connect, replset, repl_role, ok,
                 uptime, version, connections_current, connections_available,
                 globalLock_currentQueue, globalLock_activeClients, mem_bits,
                 mem_resident, mem_virtual, mem_supported, mem_mapped,
                 mem_mappedWithJournal, network_bytesIn_persecond,
                 network_bytesOut_persecond, network_numRequests_persecond,
                 opcounters_insert_persecond, opcounters_query_persecond,
                 opcounters_update_persecond, opcounters_delete_persecond,
                 opcounters_command_persecond)
        func.mysql_exec(sql, param)
        role = 'm'
        func.update_db_status_init(repl_role_new, version, host, port, tags)

    except Exception as e:
        logger_msg = "check mongodb %s:%s : %s" % (host, port, e)
        logger.warning(logger_msg)

        try:
            connect = 0
            sql = "insert into mongodb_status(server_id,host,port,tags,connect) values(%s,%s,%s,%s,%s)"
            param = (server_id, host, port, tags, connect)
            func.mysql_exec(sql, param)

        except Exception as e:
            logger.error(e)
            sys.exit(1)
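The per-second metrics in the script above come from sampling serverStatus twice, one second apart, and subtracting the counters. A minimal sketch of just that technique (it assumes a local, unauthenticated mongod on the default port and is not part of the monitoring script):

import time

import pymongo

client = pymongo.MongoClient("localhost", 27017)
s1 = client.admin.command("serverStatus")
time.sleep(1)
s2 = client.admin.command("serverStatus")

# the counter difference over roughly one second approximates an ops-per-second rate
inserts_per_second = s2["opcounters"]["insert"] - s1["opcounters"]["insert"]
queries_per_second = s2["opcounters"]["query"] - s1["opcounters"]["query"]
print(inserts_per_second, queries_per_second)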
Пример #27
0
def getAnswers():
    myClient = pymongo.MongoClient(mongo_connectStr)
    mydb = myClient['helpit']
    user_collection = mydb['user']
    topic_collection = mydb['topics']
    analysis_collection = mydb['analysis']
    user_accounts = user_collection.find({'source': 'stackoverflow'})
    today = datetime.date.today()
    lastweek = today - datetime.timedelta(days=7)
    start = str(
        int(
            datetime.datetime(lastweek.year, lastweek.month, lastweek.day, 0,
                              0, 0).timestamp()))

    for account in user_accounts:
        if 'account' in account.keys():
            accountid = int(account['account'])
            upn = account['upn']
            url_str = "https://api.stackexchange.com/2.2/users/" + str(
                accountid
            ) + "/answers?order=desc&sort=activity&site=stackoverflow&filter=!b1MMEr*sm*wys1&pagesize=100&fromdate=" + start
            pageIndex = 1
            while True:
                data = requests.get(url_str + "&page=" + str(pageIndex),
                                    headers={
                                        "Content-type": "text/json"
                                    }).json()
                messages = data['items']
                if messages is None or len(messages) == 0:
                    break
                pageIndex += 1

                for j in range(len(messages)):
                    post_time = datetime.datetime.fromtimestamp(
                        messages[j]['last_activity_date'])
                    is_accepted = messages[j]['is_accepted']
                    comments = []
                    comment_count = messages[j]['comment_count']
                    topics = getTopics(topic_collection, messages[j]['body'])
                    score = 0.0
                    if comment_count > 0:
                        comments = messages[j]['comments']
                    for t in range(comment_count):
                        score += getSentiment(comments[t]['body'])
                    if is_accepted:
                        score = score * 1.5
                    score += 0.5

                    analysis_collection.insert_one({
                        'userId': accountid,
                        'upn': upn,
                        'source': 'stackoverflow',
                        'post_time': post_time,
                        'is_accepted': is_accepted,
                        'comment_count': comment_count,
                        'score': score,
                        'topic': topics
                    })

    report_col = mydb['report']
    stime = datetime.datetime(lastweek.year, lastweek.month, lastweek.day, 0,
                              0, 0)
    today_time = datetime.datetime(today.year, today.month, today.day, 0, 0, 0)
    while stime < today_time:
        stime += datetime.timedelta(days=1)
        dtmp = stime + datetime.timedelta(days=1)
        adocs = analysis_collection.aggregate([{
            "$match": {
                'post_time': {
                    '$lt': dtmp,
                    '$gte': stime
                },
                'source': 'stackoverflow'
            }
        }, {
            "$group": {
                '_id': '$upn',
                'score': {
                    '$sum': '$score'
                },
                'count': {
                    '$sum': 1
                }
            }
        }])

        for a in adocs:
            report_col.insert_one({
                'upn': a['_id'],
                'source': 'stackoverflow',
                'count': a['count'],
                'score': a['score'],
                'date': time.mktime(stime.timetuple()) * 1000
            })
Пример #28
0
# -*- coding: utf-8 -*-
# 2018/4/10 9:03
# Scrape the Douban Music Top 250 data and write it into MongoDB, including: song name, performer, genre, release date, publisher and rating

'''
    Regular expressions are used here mainly because the performer information is not
    formatted consistently from page to page, e.g. compare
    https://music.douban.com/subject/6064884/ and https://music.douban.com/subject/4060882/
    Reference running time: 412s
'''

import requests
import pymongo
import time
import re

client = pymongo.MongoClient('localhost', 27017)
mydb = client['mydb']
musictop = mydb['musictop']

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3371.0 Safari/537.36',
}


def get_url(url):
    t = requests.get(url, headers=headers).text
    hrefs = re.findall('<a class="nbg" href="(.*?)"', t, re.S)
    for href in hrefs:
        res = requests.get(href, headers=headers)
        name = re.findall('<div id="wrapper">.*?<h1>.*?<span>(.*?)</span>', res.text, re.S)[0]
        author = re.findall('表演者:.*?>(.*?)</a>', res.text, re.S)[0]
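        # The original example is cut off at this point. A plausible continuation,
        # assuming the remaining fields (genre, release date, publisher, rating) are
        # extracted with similar regular expressions whose exact patterns are not
        # shown here, might assemble a document and write it to the musictop collection:
        info = {
            'name': name,
            'author': author,
            # 'styles': ..., 'time': ..., 'publisher': ..., 'score': ...  (patterns omitted)
        }
        musictop.insert_one(info)
        time.sleep(2)  # pause between detail-page requests


if __name__ == '__main__':
    # assumed pagination: the top-250 list is usually paged in steps of 25
    urls = ['https://music.douban.com/top250?start={}'.format(i * 25) for i in range(10)]
    for url in urls:
        get_url(url)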
Пример #29
0
import pymongo

myclient = pymongo.MongoClient("mongodb://localhost:27017/")

mydb = myclient["testdatabase"]

mycol = mydb["employee"]

mydict = { "name": "John", "address": "Highway 37" }

x = mycol.insert_one(mydict)

print("last insrted ID : ", x.inserted_id)
Пример #30
0
        with concurrent.futures.ThreadPoolExecutor() as executor:
            threads = [executor.submit(crawler) for i in range(5)]
            results = [thread.result() for thread in threads]
            return results  # list of lists returned by 5 threads
    except:
        logger.debug("Threads couldn't be made.")


if __name__ == "__main__":
    while True:
        logger.debug('Starting process')
        scraped_links = thread_crawler()  # starting process
        for result in scraped_links:
            try:
                logger.debug("Extracted " + str(len(result)) +
                             " links from HTML")
            except:
                logger.debug("Nothing returned in this cycle.")

        main_client = pymongo.MongoClient(config['localhost'],
                                          config['port_num'])
        database = main_client[config['database_name']]
        collection = database[config['collection_name']]
        if collection.count_documents(
            {}
        ) >= config['max_limit']:  # check if we have scraped the maximum number of links yet
            logger.debug(
                str(config['max_limit']) + " links scraped. Ending process!!!")
            break
        time.sleep(5.0)