Example #1
    def get_user_commits(username, dbname='ansibot', collection='api_commits'):
        '''Retrieve all commits authored by username'''

        client = MongoClient()
        mongo_db = getattr(client, dbname)
        mongo_collection = getattr(mongo_db, collection)

        pipeline = [
            {
                '$match': {'author.login': username},
            },
            {
                '$project': {
                    '_id': 0,
                    'author.login': 1,
                    'commit.committer.date': 1,
                    'sha': 1
                }
            },
            {'$sort': {'commit.committer.date': 1}}
        ]
        cursor = mongo_collection.aggregate(pipeline)
        commits = list(cursor)
        client.close()
        return commits
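A minimal usage sketch for the function above (the username value is illustrative, and the function is assumed to be callable as a plain function or staticmethod):

    # hypothetical call: list commit SHAs and dates for one author
    commits = get_user_commits('some_user')          # illustrative username
    for c in commits:
        print(c['sha'], c['commit']['committer']['date'])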
Example #2
File: views.py Project: nguerrero/pfc
def yearStatistics(request):
	# MongoDB code
	client = MongoClient(ip, port)
	db = client['twitterdata']
	tweetCollection = db['tweets']
	# Tweets per month
	result = tweetCollection.map_reduce(mapCuentaMeses, reduce,"myresult6")
	for doc in result.find():
		result.update({'_id':doc['_id']}, {'$set': {'date':doc['_id']}})
	anio = result.find(limit=10).sort('date',1)
	anioArray = []
	for a in anio:
		anioArray.append(a['value'])
	anio = result.find(limit=10).sort('date',1)
	# Retweets
	result2 = tweetCollection.map_reduce(mapCuentaMesesRT, reduce,"myresult6")
	for doc in result2.find():
		result2.update({'_id':doc['_id']}, {'$set': {'date':doc['_id']}})
	anioRT = result2.find(limit=10).sort('date',1)
	anioArrayRT = []
	for a in anioRT:
		anioArrayRT.append(a['value'])
	anioRT = result2.find(limit=10).sort('date',1)
	client.close()
	#return the template
	return render_to_response('yearStatistics.html', locals())
Example #3
def load_20_news_group():
    """ Loads the 20 news group corpus into
        a mongo database
    """
    mc = MongoClient()
    db = mc["astrology"]

    coll_name = "corpora.twenty_news_group"
    meta_coll_name = "corpora.twenty_news_group.meta"

    # Drop if already exists
    db.drop_collection(coll_name)
    db.drop_collection(meta_coll_name)

    coll = db[coll_name]
    meta_coll = db[meta_coll_name]

    labels = set()

    for batch in get_20_news_group(300, labels):
        coll.insert_many(batch)

    meta_doc = {"labels": list(labels)}
    meta_coll.insert_one(meta_doc)

    coll.create_index("label")

    mc.close()
Example #4
    def get_user_issues(username, dbname='ansibot', collection='api_issue'):
        '''Retrieve all issues created by username'''

        client = MongoClient()
        mongo_db = getattr(client, dbname)
        mongo_collection = getattr(mongo_db, collection)

        pipeline = [
            {
                '$match': {'user.login': username},
            },
            {
                '$project': {
                    '_id': 0,
                    'user.login': 1,
                    'created_at': 1,
                    'html_url': 1
                }
            },
            {'$sort': {'created_at': 1}}
        ]
        cursor = mongo_collection.aggregate(pipeline)
        issues = list(cursor)
        client.close()
        return issues
Example #5
def run(host,database,graphname):
    # Create an empty response object.
    response = {}
    collectionNames = []

    # this method traverses the documents in the selected graph collection and builds a JSON object
    # that represents the graph for the application.  It might be faster to adopt a standard
    # networkX JSON description, but this is certainly simple and flexible for an initial prototype.

    client = MongoClient(host, 27017)
    db = client[database]
    # get the collection holding the selected graph
    collection = db[graphname]
    
   
    # loop through the records in the network and take the appropriate action for each type
    nodecount = collection.find({'type':'node'}).count()
    edgecount = collection.find({'type':'link'}).count()


    # Pack the results into the response object, and return it.
    response['result'] = {}
    response['result']['nodes'] = nodecount
    response['result']['links'] = edgecount
    client.close()

    # Return the response object.
    #tangelo.log(str(response))
    return json.dumps(response)
Example #6
def lookup_phenotype_results_by_id(id_list: list):
    client = MongoClient(util.mongo_host, util.mongo_port)
    db = client[util.mongo_db]
    obj = dict()
    obj['results'] = list()
    obj['indexes'] = dict()

    try:
        # db.phenotype_results.find({"_id": { $in: [ObjectId("5b117352bcf26f020e392a9c"), ObjectId("5b117352bcf26f020e3926e2")]}})
        # TODO TODO TODO
        ids = list(map(lambda x: ObjectId(x), id_list))
        res = db.phenotype_results.find({
            "_id": {
                "$in": ids
            }
        })
        obj['results'] = list(res)
        n = 0
        for o in obj['results']:
            id = str(o['_id'])
            obj['indexes'][id] = n
            n = n + 1

    except Exception as e:
        traceback.print_exc(file=sys.stdout)
        obj['success'] = False
    finally:
        client.close()

    return obj
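A quick usage sketch, reusing the ObjectId string from the comment above (purely illustrative):

    obj = lookup_phenotype_results_by_id(['5b117352bcf26f020e392a9c'])
    for doc in obj['results']:
        print(obj['indexes'][str(doc['_id'])], doc)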
Example #7
def phenotype_subjects(job_id: str, phenotype_final: bool):
    client = MongoClient(util.mongo_host, util.mongo_port)
    db = client[util.mongo_db]
    results = []
    # db.phenotype_results.aggregate([  {"$match":{"job_id":{"$eq":10201}, "phenotype_final":{"$eq":true}}},
    #  {"$group" : {_id:"$subject", count:{$sum:1}}} ])
    try:
        q = [
            {
                "$match": {
                    "phenotype_final": {
                        "$eq": phenotype_final
                    },
                    "job_id": {
                        "$eq": int(job_id)
                    }
                }},
            {
                "$group": {
                    "_id": "$subject",
                    "count": {
                        "$sum": 1
                    }
                }
            }
        ]
        results = list(db.phenotype_results.aggregate(q))
        results = sorted(results, key=lambda r: r['count'], reverse=True)
    except Exception as e:
        traceback.print_exc(file=sys.stdout)
    finally:
        client.close()

    return results
Example #8
	def process(self):
		client = MongoClient('localhost',44444)
		db_temp_train = client['vsm_all_second']
		collection1_temp_train = db_temp_train['collection1']
		collection2_temp_train = db_temp_train['collection2']		
		collection3_temp_train = db_temp_train['collection3']
		collection4_temp_train = db_temp_train['collection4']
		collection5_temp_train = db_temp_train['collection5']

		lineNum = 1
		pat = "sa(\d)(.*)"
		with open(os.path.join(self.fileroot,self.filename),"r") as fr:
			for line in fr:
				# handle review lines whose format is malformed
				if not re.findall(pat,line):
					print("\n " + str(lineNum) + " something wrong !")
					continue

				
				result = re.findall(pat,line)
				starNum = result[0][0]
				if starNum ==   '1':
					collection1_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
				elif starNum == '2':
					collection2_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
				elif starNum == '3':
					collection3_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
				elif starNum == '4':
					collection4_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
				elif starNum == '5':
					collection5_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
	
				print('process {0} lines'.format(lineNum),end='\r\t')
				lineNum += 1
		client.close()
Example #9
def dump_articles():
    connection = MongoClient('localhost', 27017)

    db = connection.PTEST_BACKUP

    results = db.crawling.find({}, {'_id': False})

    """
    {
            "_id" : ObjectId("54dd29d2b396811764a01330"),
            "url" : "http://www.nasa.gov/pdf/55395main_12%20Earth%20Science.pdf",
            "home" : "NASA",
            "abstract" : "The mission of NASA's Earth Science ... and help answer qu
    estions concerning many related aspects of ... forecasters in assessing particul
    ate pollutio ...",
            "title" : "Earth Science - NASA",
            "keyword" : "aerosols+(pollution+aspects)",
            "stored" : true,
            "complete" : false,
            "key" : "aerosols (pollution aspects)",
            "hashed" : "aHR0cDovL3d3dy5uYXNhLmdvdi9wZGYvNTUzOTVtYWluXzEyJTIwRWFydGglMjBTY2llbmNlLnBkZg=="
    }
    """


    # upload via POST endpoint
    from scripts.remote.remote import post_curling
    import json

    for record in results:
        post_curling(_CRAWLING_POST['local'], {'resource': json.dumps(record), 'pwd': _TEMP_SECRET}, display=True)

    # close the connection to MongoDB
    connection.close()
Example #10
def insertTemplate(template):
	dbCon = MongoClient( databasePath )
	database = dbCon['allTemplates']
	collection = database['template']
	template['_id'] = ObjectId()
	collection.insert_one(template)
	dbCon.close()
Example #11
class MongoDBPipeline(object):

    collection_name = 'aqi'
    items = []


    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        print '------------connect to mongodb:', self.mongo_uri

        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        #insert to mongodb when close spider
        print '------------insert data:',len(self.items)
        print self.items
        self.db[self.collection_name].insert_many(self.items)
        self.client.close()

    def process_item(self, item, spider):

        self.items.append(item)
        #self.db[self.collection_name].insert(dict(item))
        return item
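For context, a pipeline like this is enabled and configured through the Scrapy settings; a minimal sketch of the relevant settings.py entries, where the module path and values are assumptions:

    # settings.py (sketch) -- names and values are placeholders
    MONGO_URI = 'mongodb://localhost:27017'
    MONGO_DATABASE = 'aqi_data'
    ITEM_PIPELINES = {
        'myproject.pipelines.MongoDBPipeline': 300,   # module path is an assumption
    }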
Example #12
def sync():
    client=MongoClient(MONGO_URI)
    bi=client[MONGO_BI]
    target=bi['product_bi_final']
    mapping=bi['mapping']

    for each in mapping.find({}, {'bi_cat_id': 1, 'bi_cat_name': 1, '_id': 0, 'erp_cat_id': 1}):
        result = target.find({'category_id': each['bi_cat_id']}, {"updated_time": 1, '_id': 0}).sort([('updated_time', -1)]).limit(1)
        try:
            last_time = next(result)['updated_time']
        except:
            last_time = datetime(1992, 8, 24)
        with mysql_con.cursor() as cur:
            sql='''
                SELECT
                    updated_time,
                    product_url,
                    sort_num,
                    sort_type,
                    sale_price,
                    sale_num,
                    rating,
                    product_name,
                    product_image,
                    original_price,
                    website_id AS dw_web_id,
                    comment_count,
                    goods_sn 
                FROM
                    website_product 
                WHERE
                    category_id =%s 
                    AND updated_time >= %s 
                ORDER BY
                    updated_time ASC
            '''
            cur.execute(sql, (each['bi_cat_id']-cat_offset, last_time))  # parameterized query to prevent SQL injection
            for item in cur:
                data={
                    'product_url':item[1],
                    'category_id':each['bi_cat_id'],
                    'erp_cat_id':each['erp_cat_id'],
                    'comment_count':int(item[11]),
                    'currency':"¥",
                    "original_price" : 0,
                    'product_image':re.findall(r'https://.+?\.jpg|http://.+?\.jpg',item[8]),
                    'product_name':item[7],
                    'rating':item[6],
                    'sale_num':int(item[5]),
                    'sale_price':int(item[4]*100),
                    'sort_num':item[2],
                    'sort_type':item[3],
                    'updated_time':item[0],
                    'dw_web_name':dw_web_name,
                    'goods_sn':item[12],
                }
                target.update_one({'goods_sn': item[12], 'dw_web_id': item[10]+web_offset}, {'$set': data}, upsert=True)
                print('processing {}'.format(data['product_url']))   
                
    client.close()
Example #13
def getAllModules():
    dbCon = MongoClient( databasePath )
    database = dbCon['mibModules']
    posts = database['mib']
    # materialize the cursor before closing the connection; iterating it after close() would fail
    output = list(posts.find())
    dbCon.close()
    return output
Example #14
def processAdjustCube(countryThisRoundAdjust):
    global adjustDict
    global countryAdjust

    client = MongoClient()
    db = client['login_history']
    col = db['col_adjust']
    today = datetime.datetime.now().date()
    ds = datetime.datetime(*(today.timetuple()[:6]))

    for country,adjustList in countryThisRoundAdjust.iteritems():
        intervallist = adjustList[0]
        numlist = adjustList[1]
        tmpResult = col.update({'recordtime':ds,'targetCountry':country},
                               {'$set':{'dateintervalList':intervallist,'numlist':numlist}},
                               upsert = True)

        print tmpResult

    for country,adjustList in countryThisRoundAdjust.iteritems():
        countrydetail = adjustDict.get(country,dict())
        intervallist = adjustList[0]
        numlist = adjustList[1]
        for index,interval in enumerate(intervallist):
            tmplist = countrydetail.get(interval,list())
            tmplist.append(numlist[index])
            countrydetail[interval] = tmplist
        adjustDict[country] = countrydetail

    getCountryAdjust()
    client.close()
Example #15
class MyOffsiteMiddleware(OffsiteMiddleware):
    def __init__(self, *args, **kwargs):
        super(MyOffsiteMiddleware, self).__init__()
        self.client = None
        self.db = None
        self.link_collection = None

    def spider_opened(self, spider):
        super(MyOffsiteMiddleware, self).spider_opened(spider)
        dbname = settings.MONGO_DB['name']
        collection_outlinks = settings.MONGO_DB['outlink_collection']
        self.client = MongoClient()
        self.db = self.client[dbname]
        collection = self.db[collection_outlinks][spider.collection_name]
        if collection.name in self.db.collection_names():
            self.db.drop_collection(collection.name)
        self.link_collection = collection

    def __del__(self):
        if self.client is not None:
            self.client.close()

    def should_follow(self, request, spider):
        ans = super(MyOffsiteMiddleware, self).should_follow(request, spider)
        if not ans:
            lnk = WalkerItem()
            lnk['status'] = ''
            lnk['parent'] = request.headers.get('Referer', '')
            lnk['response_hash'] = ''
            lnk['type'] = ''
            lnk['page'] = request.url
            self.link_collection.insert(dict(lnk))

        return ans
Example #16
    def check(self, resource, project_id, timestamp, value):
        client = MongoClient(self.uri)
        collection = client.log_service.quotas
        conditions = {'resource': resource, 'project_id': project_id, 'timestamp': timestamp, 'value': value}
        is_saved = collection.find(conditions).count() > 0
        client.close()
        return is_saved
Example #17
def remove_peer(peer):
    host=(socket.gethostname())
    c = MongoClient(host, 27017)
    db=c.tejo
    status=db.status
    status.remove({'peer':peer})
    c.close()    
Example #18
def main(start_from=None):

    # Load config information
    load_config()

    # Connect to RabbitMQ Queue
    connection, channel = get_queue_channel(RABBIT_HOST)

    # Connect to database
    client = MongoClient(MONGO_HOST, MONGO_PORT)
    db = client[MONGO_DB]
    db.authenticate(MONGO_USER, MONGO_PWD)
    db_repos = db[MONGO_COLL]

    last_id = start_from  # Last id variable to be advanced
    reauth = True  # Reauth variable to check if we need to reauthenticate for GitHub
    gh = None  # Just for good measure

    while 1:
        # Authenticate on GitHub and get all repos
        if reauth:
            gh = github3.login(GH_USERS[GH_CUR_USR]['login'], GH_USERS[GH_CUR_USR]['pwd'])
        repos = gh.iter_all_repos(since=last_id)

        # Crawl repos
        reauth, last_id = start_crawl(repos, db_repos, gh, channel, last_id)

    # Close connection to database
    client.close()

    #Close connection to queue
    channel.close()
    connection.close()
Example #19
class MongoStore(Store):
    def __init__(self, subscription_id, config):
        super().__init__(subscription_id)

        self._client = MongoClient(config['MONGO_HOST'],
                                   config['MONGO_PORT'])

        self._db = self._client.get_database(config['MONGO_DATABASE'])
        self._collection = self._db[config['MONGO_COLLECTION']]

    def set_state(self, state):
        self._collection.replace_one({'_id': self._subscription_id}, state, upsert=True)

    def set_value(self, key, value):
        self._collection.update_one({'_id': self._subscription_id}, {'$set': {key: value}}, upsert=True)

    def push_all(self, key, values):
        self._collection.update_one({'_id': self._subscription_id}, {'$push': {key: {'$each': values}}}, upsert=True)

    def get_value(self, key, default=None):
        state = self.get_state()
        if state:
            return state.get(key, default)
        return default

    def get_state(self):
        return self._collection.find_one({'_id': self._subscription_id})

    def get_collection(self):
        return self._collection

    def close(self):
        self._client.close()
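A brief usage sketch for MongoStore, assuming the base Store class only needs the subscription id and that the config dict carries the keys read in __init__ (all values are placeholders):

    config = {
        'MONGO_HOST': 'localhost',
        'MONGO_PORT': 27017,
        'MONGO_DATABASE': 'subscriptions',
        'MONGO_COLLECTION': 'states',
    }
    store = MongoStore('sub-42', config)            # subscription id is illustrative
    store.set_value('last_seen', '2020-01-01')
    store.push_all('events', ['created', 'updated'])
    print(store.get_value('last_seen'))
    store.close()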
Example #20
class MTSGetdataPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri = crawler.settings.get("MONGO_URI"),
            mongo_db = crawler.settings.get("MONGO_DATABASE")
        )

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # collection_name = self.__class__.__name__
        # tmp = dict(item)
        # print "***************************", tmp, "!!!!!!!!!!!!!!!!!!!!!!!"
        # self.db[collection_name].insert(tmp)
        JsonFile.append(dict(item))
        return item
        # return None


    def close_spider(self, spider):
        print len(JsonFile)
        fp = open("TestMidi.json","wb")
        fp.write(json.dumps(JsonFile))
        self.client.close()
Example #21
File: mongo.py Project: tnlin/ptt-scrapy
    def init_from_mongo(self):
        client = MongoClient('mongodb://localhost:27017/') 
        db = client.ptt
        posts = db.gossiping_38k 
        jieba.set_dictionary('extra_dict/dict.txt.big')
        jieba.analyse.set_stop_words("extra_dict/stop_words_cht.txt")   
        for post in posts.find():
            #For content
            d = defaultdict(int)
            content = post['content']
            if post['score'] != 0:
                for l in content.split('\n'):
                    if l:
                        for w in jieba.cut(l):
                            d[w] += 1
            if len(d) > 0:
                self.words.append(d)
                self.scores.append(1 if post['score'] > 0 else 0)
            #For comments
            for comment in post['comments']:
                l = comment['content'].strip()
                if l and comment['score'] != 0:
                    d = defaultdict(int)
                    for w in jieba.cut(l):
                        d[w] += 1
                    if len(d) > 0:
                        self.c_words.append(d)
                        self.c_scores.append(1 if comment['score'] > 0 else 0)

        client.close()   
Example #22
    def add_domain(self, domain, ip, key, ttl = 120, timestamp = time.time() ):
        if domain not in self._domains:
            self._domains[domain] = Domain(domain, ip, key, ttl, timestamp)

            c = MongoClient("localhost",27017).p2pdns
	    c.domains.insert_one({"domain":domain,"ip":ip,"key":key,"ttl":ttl,"timestamp":timestamp})
            c.close()
Example #23
class Application(tornado.web.Application):
    def __init__(self):
        '''Store necessary handlers,
           connect to database
        '''
    
        handlers = [(r"/[/]?", 
                        BaseHandler),

                    (r"/GetLocations[/]?",
                        predictionHandlers.GetLocationsHandler), 
                    (r"/GetLandmarks[/]?",
                        predictionHandlers.GetLandmarksHandler), 

                    (r"/AddLearningData[/]?",
                        predictionHandlers.AddLearningDataHandler), 
                    (r"/PredictLocation[/]?",
                        predictionHandlers.PredictLocationHandler)    
                    ]

        settings = {'debug':True}
        tornado.web.Application.__init__(self, handlers, **settings)

        self.client  = MongoClient() # local host, default port
        self.db = self.client.exploreSMU # sklearndatabase # database with labeledinstances, models
        self.clf = []

        #self.client.close() # this opened a socket -- lets close that connection

    def __exit__(self):
        self.client.close() # just in case
Example #24
def store_in_mongo(lst_of_dcts, db_name, collection_name, key=None): 
    """Store the list of dictionaries in Mongo

    Args: 
        lst_of_dcts: List of dictionaries to insert into Mongo.
        db_name: String - database name
        collection_name: String - collection name
        key: Optional string - if given, documents are stored via store_in_mongo_by_key
    """
    
    client = MongoClient()
    db = client[db_name]
    collection = db[collection_name]
    
    if key is not None: 
        store_in_mongo_by_key(lst_of_dcts, collection, key)
    else: 
        # Check if the length is one, in which case we need to use 
        # insert_one. Otherwise, make sure that it's not empty (i.e. 
        # the `elif` statement) below, and then insert many. If it's 
        # empty, then just don't do anything and close the client. 
        if len(lst_of_dcts) == 1: 
            collection.insert_one(lst_of_dcts[0])
        elif lst_of_dcts: 
            collection.insert_many(lst_of_dcts)

    client.close()
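A short, hedged example of calling store_in_mongo (database and collection names are placeholders):

    docs = [
        {'title': 'post one', 'views': 10},
        {'title': 'post two', 'views': 3},
    ]
    store_in_mongo(docs, 'blog', 'posts')       # two documents -> insert_many
    store_in_mongo(docs[:1], 'blog', 'posts')   # single document -> insert_one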
Example #25
def write2mongo(stations):
    """
    Write the station info into MongoDB.
    :param stations: station info; the fields of a single station are separated by |
    :return:
    """
    logger = logging.getLogger(__name__)  # defined outside try so it is available in the except clause
    try:
        logger.info('starting write station info')
        con = MongoClient('localhost', 27017)  # connect to MongoDB
        data_list = []
        for station in stations:
            parts = ('' + station).split('|')
            data = {
                'Chinese': parts[1],
                'ext': parts[2],
                'pinyin': parts[3],
                'abbr': parts[4],
                'order': parts[5]
            }
            data_list.append(data)
        local = con.get_database('python')
        collection = local.get_collection('12306_station')
        collection.remove()  # clear existing data before insert, to avoid duplicates
        collection.insert_many(data_list)
        con.close()
        logger.info('write {0} station info : Done'.format(len(data_list)))
    except IOError as e:
        logger.error(e)
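The function splits every station string on '|' and reads fields 1 through 5; a hedged sketch of what the input could look like (the records below are illustrative, modeled on the 12306 station list format):

    # each entry: '<prefix>|<Chinese name>|<telecode>|<pinyin>|<abbreviation>|<order>'
    stations = [
        '@bjb|北京北|VAP|beijingbei|bjb|0',
        '@bjd|北京东|BOP|beijingdong|bjd|1',
    ]
    write2mongo(stations)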
Example #26
class MongoOperator:
    def __init__(self, db):
        self.dbName = db

    def setUpConnection(self):
        self.client = MongoClient("localhost", 27017)
        self.db = self.client[self.dbName]

    def setUpCollection(self, collName):

        if collName in self.db.collection_names():
            self.collection = self.db.get_collection(collName)
        else:
            self.db.create_collection(collName)
            self.collection = self.db.get_collection(collName)

    def getOne(self):
        print(self.collection.find_one())

    def getAll(self):
        return self.collection.find({})

    def insertOne(self, res):
        self.collection.insert_one(res)

    def insertMany(self, listofRes):
        self.collection.insert_many(listofRes)

    def closeConnection(self):
        self.client.close()
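A short usage sketch for MongoOperator (database and collection names are placeholders):

    op = MongoOperator('testdb')
    op.setUpConnection()
    op.setUpCollection('articles')
    op.insertOne({'title': 'hello'})
    for doc in op.getAll():
        print(doc)
    op.closeConnection()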
Example #27
File: pipelines.py Project: qwteng/Eagle
class MongoDBPipeline(object):
	def __init__(self, mongodb_server, mongodb_port, mongodb_db, mongodb_collection):
		self.mongodb_server = mongodb_server
		self.mongodb_port = mongodb_port
		self.mongodb_db = mongodb_db
		self.mongodb_collection = mongodb_collection
	
	@classmethod
	def from_crawler(cls, crawler):
		print "in crawler"
		return cls(
			mongodb_server= crawler.settings.get('MONGODB_SERVER'),
			mongodb_port =  int(crawler.settings.get('MONGODB_PORT')),
			mongodb_db=crawler.settings.get('MONGODB_DB'),
			mongodb_collection=crawler.settings.get('MONGODB_COLLECTION')
		)

	def open_spider(self, spider):
		self.client = MongoClient(self.mongodb_server, self.mongodb_port)
		self.db = self.client[self.mongodb_db]
		self.collection = self.db[self.mongodb_collection]

	def close_spider(self, spider):
		self.client.close()

	def process_item(self, item, spider):
		print "in pipeline"
		#log.msg("begin insert data", level=log.DEBUG, splider=splider)
		self.collection.insert(dict(item))
			

		return item
Example #28
    def add_node(self, host, port):
        if host not in self._nodes:
            self._nodes[host] = int(port)

            c = MongoClient("localhost",27017).p2pdns
	    c.nodes.insert_one({"ip":host,"port":port})
            c.close()
Example #29
class MongoAnalyticsTest(unittest.TestCase):

    #
    # Initialization of the private members, to be completed
    def setUp(self):
        self.mongoclient = MongoClient()
        self.db = self.mongoclient.analytics
        self.hits = self.db.hits


    def tearDown(self): 
        self.mongoclient.close()


    """
    Find the highest traffic (number of requests per day) for the url http://www.lateral-thoughts.com
    using the aggregation framework
    Docs to read:
    - $year http://docs.mongodb.org/manual/reference/operator/aggregation/year/
    - $month http://docs.mongodb.org/manual/reference/operator/aggregation/month/
    - $dayOfMonth http://docs.mongodb.org/manual/reference/operator/aggregation/dayOfMonth/
    """
    @unittest.skip('Remove to play this test')
    def testFindHighestHitsForUrl(self):
        pipeline = []
        result = self.hits.aggregate(pipeline)
        self.assertEqual(result['result'][0]['hits'], 66)
        self.assertEqual(result['result'][0]['_id']['y'], 2012)
        self.assertEqual(result['result'][0]['_id']['m'], 3)
        self.assertEqual(result['result'][0]['_id']['d'], 23)
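One plausible pipeline for the skipped test above, grouping hits per day for the given url with $year/$month/$dayOfMonth; this is a sketch rather than the exercise's official solution, and it assumes each hit document carries 'url' and 'date' fields:

        # sketch: field names 'url' and 'date' are assumptions about the test data
        pipeline = [
            {'$match': {'url': 'http://www.lateral-thoughts.com'}},
            {'$group': {'_id': {'y': {'$year': '$date'},
                                'm': {'$month': '$date'},
                                'd': {'$dayOfMonth': '$date'}},
                        'hits': {'$sum': 1}}},
            {'$sort': {'hits': -1}},
            {'$limit': 1},
        ]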
Example #30
def run(host,database,graphA,graphB,handle,displaymode):
    # Create an empty response object.
    response = {}

    # look through the collections in the ivaan database and return the name of all collections
    # that match the naming profile for tables.  This is matching to see if the collection name
    # begins with "seeds_" or not, since this routine can return the matching graphs (that don't start
    # with 'seeds_') or the matching seeds.
    
    # build topk collection name from 
    topk_collection_name = 'topk_'+graphA+'_'+graphB
    #topk_collection_name = 'topk_twitter_geosample_mentions_v2_october_combined_instagram_mentions_nodelink_october'
    print 'looking for topk in collection', topk_collection_name
    #topk_collection_name = 'topk'

    client = MongoClient(host, 27017)
    db = client[database]
    topk_collection = db[topk_collection_name]

    # query the topk collection for rows matching the given handle
    query = {'ga':handle}
    tablerows = []
    # return only the columns to potentially display in LineUp.  We don't want to return the gA entity we used to search by
    topk = topk_collection.find(query,{'_id':0,'ga':0})
    for row in topk:
        tablerows.append(row)

    client.close()

    # Pack the results into the response object, and return it.
    response['result'] = tablerows

    # Return the response object.
    #tangelo.log(str(response))
    return json.dumps(response)
Example #31
class ContextClassHarvester:

    #DEFAULT_CONFIG_SECTION = 'CONFIG'
    #HARVESTER_MONGO_HOST = 'harvester.mongo.host'
    #HARVESTER_MONGO_PORT = 'harvester.mongo.port'

    #ORGHARVESTER_MONGO_HOST = 'organization.harvester.mongo.host'
    #ORGHARVESTER_MONGO_PORT = 'organization.harvester.mongo.port'

    LOG_LOCATION = 'logs/entlogs/'

    CHUNK_SIZE = 250  # each file will consist of 250 entities
    WRITEDIR = os.path.join(os.path.dirname(__file__), '..', 'entities_out')
    CONFIG_DIR = os.path.join(os.path.dirname(__file__), '..', 'config')
    LANG_VALIDATOR = LanguageValidator()

    LABEL = 'label'
    TYPE = 'type'
    TYPE_STRING = 'string'
    TYPE_OBJECT = 'obj'
    TYPE_REF = 'ref'
    PROP_OWL_SAMEAS = 'owlSameAs'

    #TODO remove when whole code is switched to use the EnrichmentEntity language constants
    LANG_DEF = EnrichmentEntity.LANG_DEF
    LANG_EN = EnrichmentEntity.LANG_EN

    IGNORED_PROPS = ['about', '_id', "className", "edmOrganizationSector"]

    FIELD_MAP = {
        # maps mongo fields to their solr equivalents
        # TODO: there are numerous fields defined in the schema but not
        # found in the actual data. They are accordingly not represented here.
        # For a list of all fields that might conceivably exist in accordance
        # with the data model, see https://docs.google.com/spreadsheets/d/
        #           1b1UN27M2eCia0L54di0KQY7KcndTq8-wxzwM4wN-8DU/edit#gid=340708208
        'prefLabel': {
            LABEL: 'skos_prefLabel',
            TYPE: TYPE_STRING
        },
        'altLabel': {
            LABEL: 'skos_altLabel',
            TYPE: TYPE_STRING
        },
        'hiddenLabel': {
            LABEL: 'skos_hiddenLabel',
            TYPE: TYPE_STRING
        },
        'edmAcronym': {
            LABEL: 'edm_acronym',
            TYPE: TYPE_STRING
        },
        'note': {
            LABEL: 'skos_note',
            TYPE: TYPE_STRING
        },
        'begin': {
            LABEL: 'edm_begin',
            TYPE: TYPE_STRING
        },
        'end': {
            LABEL: 'edm_end',
            TYPE: TYPE_STRING
        },
        'owlSameAs': {
            LABEL: 'owl_sameAs',
            TYPE: TYPE_REF
        },
        'edmIsRelatedTo': {
            LABEL: 'edm_isRelatedTo',
            TYPE: TYPE_REF
        },
        'dcIdentifier': {
            LABEL: EnrichmentEntity.DC_IDENTIFIER,
            TYPE: TYPE_STRING
        },
        'dcDescription': {
            LABEL: 'dc_description',
            TYPE: TYPE_STRING
        },
        'rdaGr2DateOfBirth': {
            LABEL: 'rdagr2_dateOfBirth',
            TYPE: TYPE_STRING
        },
        #not used yet
        #'rdaGr2DateOfEstablishment' : { 'label': 'rdagr2_dateOfEstablishment' , TYPE : TYPE_STRING },
        'rdaGr2DateOfDeath': {
            LABEL: 'rdagr2_dateOfDeath',
            TYPE: TYPE_STRING
        },
        #not used yet
        #'rdaGr2DateOfTermination' : { 'label': 'rdagr2_dateOfTermination' , TYPE : TYPE_STRING },
        'rdaGr2PlaceOfBirth': {
            LABEL: 'rdagr2_placeOfBirth',
            TYPE: TYPE_STRING
        },
        'placeOfBirth': {
            LABEL: 'rdagr2_placeOfBirth',
            TYPE: TYPE_STRING
        },
        #not used yet
        #'placeOfBirth_uri' : { 'label': 'rdagr2_placeOfBirth.uri' , TYPE : TYPE_STRING },
        'rdaGr2PlaceOfDeath': {
            LABEL: 'rdagr2_placeOfDeath',
            TYPE: TYPE_STRING
        },
        #not used yet
        #'placeOfDeath_uri' : { 'label': 'rdagr2_placeOfDeath.uri' , TYPE : TYPE_STRING },
        #not used yet
        #'professionOrOccupation_uri' : { 'label': 'professionOrOccupation.uri' , TYPE : TYPE_STRING },
        'rdaGr2ProfessionOrOccupation': {
            LABEL: 'rdagr2_professionOrOccupation',
            TYPE: TYPE_STRING
        },
        #not used yet
        #'gender' : { 'label': 'gender' , TYPE : TYPE_STRING },
        'rdaGr2Gender': {
            LABEL: 'rdagr2_gender',
            TYPE: TYPE_STRING
        },
        'rdaGr2BiographicalInformation': {
            LABEL: 'rdagr2_biographicalInformation',
            TYPE: TYPE_STRING
        },
        'latitude': {
            LABEL: 'wgs84_pos_lat',
            TYPE: TYPE_STRING
        },
        'longitude': {
            LABEL: 'wgs84_pos_long',
            TYPE: TYPE_STRING
        },
        #not used yet
        #'beginDate' : { 'label': 'edm_beginDate' , TYPE : TYPE_STRING },
        #not used yet
        #'endDate' : { 'label': 'edm_endDate' , TYPE : TYPE_STRING },
        'isPartOf': {
            LABEL: 'dcterms_isPartOf',
            TYPE: TYPE_REF
        },
        #edm_isNextInSequence
        'isNextInSequence': {
            LABEL: 'edm_isNextInSequence',
            TYPE: TYPE_REF
        },
        'hasPart': {
            LABEL: 'dcterms_hasPart',
            TYPE: TYPE_REF
        },
        'hasMet': {
            LABEL: 'edm_hasMet',
            TYPE: TYPE_REF
        },
        'date': {
            LABEL: 'dc_date',
            TYPE: TYPE_STRING
        },
        'related': {
            LABEL: 'skos_related',
            TYPE: TYPE_REF
        },
        'broader': {
            LABEL: 'skos_broader',
            TYPE: TYPE_REF
        },
        'narrower': {
            LABEL: 'skos_narrower',
            TYPE: TYPE_REF
        },
        'broadMatch': {
            LABEL: 'skos_broadMatch',
            TYPE: TYPE_REF
        },
        'narrowMatch': {
            LABEL: 'skos_narrowMatch',
            TYPE: TYPE_REF
        },
        'relatedMatch': {
            LABEL: 'skos_relatedMatch',
            TYPE: TYPE_REF
        },
        'exactMatch': {
            LABEL: 'skos_exactMatch',
            TYPE: TYPE_REF
        },
        'closeMatch': {
            LABEL: 'skos_closeMatch',
            TYPE: TYPE_REF
        },
        'notation': {
            LABEL: 'skos_notation',
            TYPE: TYPE_REF
        },
        'inScheme': {
            LABEL: 'skos_inScheme',
            TYPE: TYPE_REF
        },
        'foafLogo': {
            LABEL: 'foaf_logo',
            TYPE: TYPE_REF
        },
        'foafDepiction': {
            LABEL: 'foaf_depiction',
            TYPE: TYPE_REF
        },
        # not used yet
        #name' : { 'label' : 'foaf_name', TYPE : TYPE_STRING },
        'foafHomepage': {
            LABEL: 'foaf_homepage',
            TYPE: TYPE_REF
        },
        'foafPhone': {
            LABEL: 'foaf_phone',
            TYPE: TYPE_STRING
        },
        'foafMbox': {
            LABEL: 'foaf_mbox',
            TYPE: TYPE_STRING
        },
        'edmCountry': {
            LABEL: EnrichmentEntity.COUNTRY,
            TYPE: TYPE_STRING
        },
        'edmEuropeanaRole': {
            LABEL: EnrichmentEntity.EUROPEANA_ROLE,
            TYPE: TYPE_STRING
        },
        'edmOrganizationDomain': {
            LABEL: EnrichmentEntity.ORGANIZATION_DOMAIN,
            TYPE: TYPE_STRING
        },
        #TODO: remove, not supported anymore
        #'edmOrganizationSector' : { 'label' : 'edm_organizationSector', TYPE : TYPE_STRING},
        #'edmOrganizationScope' : { 'label' : 'edm_organizationScope', TYPE : TYPE_STRING},
        'edmGeographicLevel': {
            LABEL: EnrichmentEntity.GEOGRAPHIC_LEVEL,
            TYPE: TYPE_STRING
        },
        'address': {
            LABEL: 'vcard_hasAddress',
            TYPE: TYPE_OBJECT
        },
        #not sure if used anymore
        'address_about': {
            LABEL: 'vcard_hasAddress',
            TYPE: TYPE_STRING
        },
        'vcardStreetAddress': {
            LABEL: 'vcard_streetAddress',
            TYPE: TYPE_STRING
        },
        'vcardLocality': {
            LABEL: 'vcard_locality',
            TYPE: TYPE_STRING
        },
        #not used yet
        #'vcardRegion' : { LABEL : 'vcard_region', TYPE : TYPE_STRING },
        'vcardPostalCode': {
            LABEL: 'vcard_postalCode',
            TYPE: TYPE_STRING
        },
        'vcardCountryName': {
            LABEL: 'vcard_countryName',
            TYPE: TYPE_STRING
        },
        'vcardPostOfficeBox': {
            LABEL: 'vcard_postOfficeBox',
            TYPE: TYPE_STRING
        },
        'vcardHasGeo': {
            LABEL: 'hasGeo',
            TYPE: TYPE_STRING
        }
    }

    def log_warm_message(self, entity_id, message):
        # TODO: differentiate logfiles by date
        filename = "warn.txt"
        filepath = LanguageValidator.LOG_LOCATION + filename
        with open(filepath, 'a') as lgout:
            msg = "Warning info on processing entity " + str(
                entity_id) + ": " + str(message)
            lgout.write(msg)
            lgout.write("\n")

    # TODO: add address processing

    def __init__(self, entity_type):
        sys.path.append(os.path.join(os.path.dirname(__file__)))
        sys.path.append(
            os.path.join(os.path.dirname(__file__), 'ranking_metrics'))
        sys.path.append(
            os.path.join(os.path.dirname(__file__), 'preview_builder'))

        from pymongo import MongoClient
        #import PreviewBuilder
        #import HarvesterConfig

        self.config = HarvesterConfig()
        #TODO: remove field name and use entity type
        self.name = entity_type + 's'
        self.client = MongoClient(self.get_mongo_host())
        self.ranking_model = self.config.get_relevance_ranking_model()
        self.write_dir = ContextClassHarvester.WRITEDIR + "/" + self.ranking_model
        #TODO create working dir here, including folders for individual entities and organization type
        self.entity_type = entity_type
        self.preview_builder = PreviewBuilder.PreviewBuilder(
            self.client, entity_type)
        self.depiction_manager = DepictionManager.DepictionManager(self.config)

    def get_mongo_host(self):
        #return default mongo host, the subclasses may use the type based config (e.g. see organizations)
        return self.config.get_mongo_host()

    #def get_mongo_port (self):
    #return default mongo port, the subclasses may use the type based config (e.g. see also organizations host)
    #return self.config.get_mongo_port()

    def get_entity_count(self):
        entities = self.client.get_database(
            HarvesterConfig.DB_ENRICHMENT).get_collection(
                HarvesterConfig.COL_ENRICHMENT_TERM).find({
                    'entityType':
                    self.entity_type.upper(),
                    EnrichmentEntity.ENTITY_ID: {
                        '$regex': 'http://data.europeana.eu/.*'
                    }
                }).count()
        return entities

    def build_entity_chunk(self, start):
        #TODO rename variables, places-> entity
        entities = self.client.get_database(
            HarvesterConfig.DB_ENRICHMENT).get_collection(
                HarvesterConfig.COL_ENRICHMENT_TERM).find(
                    {
                        'entityType': self.entity_type.upper(),
                        EnrichmentEntity.ENTITY_ID: {
                            '$regex': 'http://data.europeana.eu/.*'
                        }
                    }, {
                        EnrichmentEntity.ENTITY_ID: 1,
                        '_id': 0
                    })[start:start + ContextClassHarvester.CHUNK_SIZE]

        entities_chunk = {}
        for entity in entities:
            entity_id = entity[EnrichmentEntity.ENTITY][EnrichmentEntity.ABOUT]
            entities_chunk[entity_id] = self.client.get_database(
                HarvesterConfig.DB_ENRICHMENT).get_collection(
                    HarvesterConfig.COL_ENRICHMENT_TERM).find_one(
                        {EnrichmentEntity.ENTITY_ID: entity_id})
        return entities_chunk

    def extract_numeric_id(self, entity_id):
        parts = entity_id.split("/")
        #numeric id is the last part of the URL
        return parts[len(parts) - 1]

    def build_solr_doc(self, entities, start, one_entity=False):
        from xml.etree import ElementTree as ET

        docroot = ET.Element('add')
        for entity_id, values in entities.items():
            print("processing entity:" + entity_id)
            self.build_entity_doc(docroot, entity_id, values)
        self.client.close()
        return self.write_to_file(docroot, start, one_entity)

    def build_entity_doc(self, docroot, entity_id, entity_rows):
        #sys.path.append('ranking_metrics')
        from xml.etree import ElementTree as ET
        doc = ET.SubElement(docroot, 'doc')
        self.add_field(doc, 'id', entity_id)
        #self.add_field(doc, 'internal_type', 'Place')
        self.add_field(doc, 'internal_type', self.entity_type.capitalize())
        self.process_created_modified_timestamps(doc, entity_rows)
        self.process_representation(doc, entity_id, entity_rows)

    def add_field_list(self, docroot, field_name, values):
        if (values is None):
            return
        for value in values:
            self.add_field(docroot, field_name, value)

    def add_field(self, docroot, field_name, field_value):
        from xml.etree import ElementTree as ET

        f = ET.SubElement(docroot, 'field')
        f.set('name', field_name)
        try:
            f.text = self.sanitize_field(field_value)
        except Exception as ex:
            print(str(field_name) + "!" + str(field_value) + str(ex))

    def sanitize_field(self, field_value):
        field_value = field_value.replace("\n", " ")
        field_value = field_value.replace("\\n", " ")
        field_value = field_value.replace("\t", " ")
        return field_value

    def write_to_file(self, doc, start, one_entity):
        from xml.etree import ElementTree as ET
        from xml.dom import minidom
        import io
        writepath = self.get_writepath(start, one_entity)
        roughstring = ET.tostring(doc, encoding='utf-8')
        reparsed = minidom.parseString(roughstring)
        reparsed = reparsed.toprettyxml(encoding='utf-8',
                                        indent="     ").decode('utf-8')
        with io.open(writepath, 'w', encoding='utf-8') as writefile:
            writefile.write(reparsed)
            writefile.close()
        return writepath

    def get_writepath(self, start, one_entity):
        if (one_entity):
            return self.write_dir + "/individual_entities/" + self.name + "/" + str(
                start) + ".xml"
        else:
            return self.write_dir + "/" + self.name + "/" + self.name + "_" + str(
                start) + "_" + str(start +
                                   ContextClassHarvester.CHUNK_SIZE) + ".xml"

    def grab_relevance_ratings(self, docroot, entity_id, entity):
        metrics_record = self.relevance_counter.get_raw_relevance_metrics(
            entity)
        eu_enrichments = metrics_record.uri_hits
        eu_terms = metrics_record.term_hits
        pagerank = metrics_record.pagerank
        if (self.ranking_model ==
                self.config.HARVESTER_RELEVANCE_RANKING_MODEL_DEFAULT):
            ds = self.relevance_counter.calculate_relevance_score(
                entity_id, pagerank, eu_enrichments, eu_terms)
        elif (self.ranking_model ==
              self.config.HARVESTER_RELEVANCE_RANKING_MODEL_NORMALIZED):
            ds = self.relevance_counter.calculate_normalized_score(
                pagerank, eu_enrichments, eu_terms)
        else:
            raise ValueError(
                "Must set property harvester.relevance.ranking.model to one of the values <default> or <normalized>"
            )
        self.add_field(docroot, 'europeana_doc_count', str(eu_enrichments))
        self.add_field(docroot, 'europeana_term_hits', str(eu_terms))
        self.add_field(docroot, 'pagerank', str(pagerank))
        self.add_field(docroot, 'derived_score', str(ds))
        self.add_suggest_filters(docroot, eu_enrichments)
        return True

    def grab_isshownby(self, docroot, web_resource):
        if (web_resource is not None):
            self.add_field(docroot, 'isShownBy', web_resource.media_url)
            self.add_field(docroot, 'isShownBy.source',
                           web_resource.europeana_item_id)
            self.add_field(docroot, 'isShownBy.thumbnail',
                           web_resource.thumbnail_url)

    def process_address(self, docroot, entity_id, address):
        #TODO check if the full address is needed
        #address_components = []
        for k, v in address.items():
            key = k
            value = v
            #about is not an ignored property for address
            if ("about" == k):
                key = "address_" + k
            elif ("vcardHasGeo" == k):
                #remove geo:, keep just lat,long
                value = v.split(":")[-1]

            if (self.is_ignored_property(key)):
                #ignored properties are not mapped to solr document
                continue

            if (key not in ContextClassHarvester.FIELD_MAP.keys()):
                self.log_warm_message(entity_id, "unmapped field: " + key)
                continue

            field_name = ContextClassHarvester.FIELD_MAP[key][self.LABEL]
            if ("vcardHasGeo" != k):
                field_name = field_name + ".1"

            self.add_field(docroot, field_name, value)
            #address_components.append(v)

    def process_created_modified_timestamps(self, docroot, entity_rows):
        # Solr time format YYYY-MM-DDThh:mm:ssZ
        if "created" in entity_rows:
            self.add_field(docroot, 'created',
                           entity_rows["created"].isoformat() + "Z")
        #"modified" changed to updated in the database
        if "updated" in entity_rows:
            self.add_field(docroot, "modified",
                           entity_rows["updated"].isoformat() + "Z")

    def is_ignored_property(self, characteristic):
        return str(characteristic) in self.IGNORED_PROPS

    def process_representation(self, docroot, entity_id, entity):
        #all pref labels
        all_preflabels = []
        for characteristic in entity[EnrichmentEntity.REPRESENTATION]:
            if (self.is_ignored_property(characteristic)):
                continue
            elif (str(characteristic)
                  not in ContextClassHarvester.FIELD_MAP.keys()):
                # TODO: log this?
                print("unmapped property: " + str(characteristic))
                continue
            elif (characteristic == "address"):
                self.process_address(
                    docroot, entity_id,
                    entity[EnrichmentEntity.REPRESENTATION]['address'])
            # TODO: Refactor horrible conditional
            elif (str(characteristic) == "dcIdentifier"):
                self.add_field_list(
                    docroot, EnrichmentEntity.DC_IDENTIFIER,
                    entity[EnrichmentEntity.REPRESENTATION]['dcIdentifier'][
                        EnrichmentEntity.LANG_DEF])
            elif (str(characteristic) == "edmOrganizationDomain"):
                #TODO: create method to add solr field for .en fields
                self.add_field(
                    docroot, EnrichmentEntity.ORGANIZATION_DOMAIN + "." +
                    EnrichmentEntity.LANG_EN,
                    entity[EnrichmentEntity.REPRESENTATION]
                    ['edmOrganizationDomain'][EnrichmentEntity.LANG_EN])
            elif (str(characteristic) == "edmEuropeanaRole"):
                #multivalued
                roles = entity[EnrichmentEntity.REPRESENTATION][
                    'edmEuropeanaRole'][EnrichmentEntity.LANG_EN]
                self.add_field_list(
                    docroot, EnrichmentEntity.EUROPEANA_ROLE + "." +
                    EnrichmentEntity.LANG_EN, roles)
            elif (str(characteristic) == "edmGeographicLevel"):
                self.add_field(
                    docroot, EnrichmentEntity.GEOGRAPHIC_LEVEL + "." +
                    EnrichmentEntity.LANG_EN,
                    entity[EnrichmentEntity.REPRESENTATION]
                    ['edmGeographicLevel'][EnrichmentEntity.LANG_EN])
            elif (str(characteristic) == "edmCountry"):
                self.add_field(
                    docroot, EnrichmentEntity.COUNTRY,
                    entity[EnrichmentEntity.REPRESENTATION]['edmCountry'][
                        EnrichmentEntity.LANG_EN])
            elif (str(characteristic) == "begin"):
                #pick first value from default language for timestamps, need to check for agents
                self.add_field(
                    docroot, EnrichmentEntity.EDM_BEGIN,
                    entity[EnrichmentEntity.REPRESENTATION]['begin'][
                        EnrichmentEntity.LANG_DEF][0])
            elif (str(characteristic) == "end"):
                #pick first value from default language for timestamps, need to check for agents
                self.add_field(
                    docroot, EnrichmentEntity.EDM_END,
                    entity[EnrichmentEntity.REPRESENTATION]['end'][
                        EnrichmentEntity.LANG_DEF][0])
            elif (type(entity[EnrichmentEntity.REPRESENTATION][characteristic])
                  is dict):
                # hiddenLabels are currently used only for Timespans
                if (str(characteristic) == "hiddenLabel"
                        and self.ignore_hidden_label()):
                    continue

                #for each entry in the language map
                for lang in entity[
                        EnrichmentEntity.REPRESENTATION][characteristic]:
                    pref_label_count = 0
                    #avoid duplicates when adding values from prefLabel
                    prev_alts = []
                    if (ContextClassHarvester.LANG_VALIDATOR.
                            validate_lang_code(entity_id, lang)):
                        field_name = ContextClassHarvester.FIELD_MAP[
                            characteristic][self.LABEL]
                        field_values = entity[EnrichmentEntity.REPRESENTATION][
                            characteristic][lang]
                        #property is language map of strings
                        if (type(field_values) == str):
                            lang_code = lang if lang != EnrichmentEntity.LANG_DEF else ''
                            q_field_name = field_name + "." + lang_code
                            #field value = field_values
                            self.add_field(docroot, q_field_name, field_values)
                        else:
                            #for each value in the list
                            for field_value in field_values:
                                q_field_name = field_name
                                lang_code = lang if lang != EnrichmentEntity.LANG_DEF else ''
                                if (ContextClassHarvester.
                                        FIELD_MAP[characteristic][
                                            self.TYPE] == self.TYPE_STRING):
                                    q_field_name = field_name + "." + lang_code
                                # Code snarl: we often have more than one prefLabel per language in the data
                                # We can also have altLabels
                                # We want to shunt all but the first-encountered prefLabel into the altLabel field
                                # while ensuring the altLabels are individually unique
                                # TODO: Refactor (though note that this is a non-trivial refactoring)
                                # NOTE: prev_alts are for one language, all_preflabels include labels in any language
                                if (characteristic == 'prefLabel'
                                        and pref_label_count > 0):
                                    #move all additional labels to alt label
                                    q_field_name = "skos_altLabel." + lang_code
                                    #SG - TODO: add dropped pref labels to prev_alts??
                                    #prev_alts.append(field_value)
                                if ('altLabel' in q_field_name):
                                    #TODO: SG why this? we skip alt labels here, but we don't add the gained entries from prefLabels

                                    if (field_value in prev_alts):
                                        continue
                                    prev_alts.append(field_value)
                                    #suggester uses alt labels for some entity types (organizations)
                                    #disables until altLabels are added to payload
                                    #self.add_alt_label_to_suggest(field_value, all_preflabels)
                                if (str(characteristic) == "edmAcronym"):
                                    #suggester uses alt labels for some entity types (organizations)
                                    self.add_acronym_to_suggest(
                                        field_value, all_preflabels)

                                if (characteristic == 'prefLabel'
                                        and pref_label_count == 0):
                                    pref_label_count = 1
                                    #TODO: SG - the suggester could actually make use of all pref labels, but the hightlighter might crash
                                    all_preflabels.append(field_value)

                                #add field to solr doc
                                self.add_field(docroot, q_field_name,
                                               field_value)
            #property is list
            elif (type(entity[EnrichmentEntity.REPRESENTATION][characteristic])
                  is list):
                field_name = ContextClassHarvester.FIELD_MAP[characteristic][
                    self.LABEL]
                for entry in entity[
                        EnrichmentEntity.REPRESENTATION][characteristic]:
                    self.add_field(docroot, field_name, entry)
            # property is a single value
            else:
                try:
                    field_name = ContextClassHarvester.FIELD_MAP[
                        characteristic][self.LABEL]
                    field_value = entity[
                        EnrichmentEntity.REPRESENTATION][characteristic]
                    self.add_field(docroot, field_name, str(field_value))
                except KeyError as error:
                    print(
                        'Attribute found in source but undefined in schema.' +
                        str(error))

        #add suggester payload
        web_resource = self.depiction_manager.get_depiction(entity_id)
        self.grab_isshownby(docroot, web_resource)
        payload = self.build_payload(entity_id, entity, web_resource)
        self.add_field(docroot, 'payload', json.dumps(payload))
        #add suggester field
        all_preflabels = self.shingle_preflabels(all_preflabels)
        # SG: values in the same language are joined using space separator. values in different languages are joined using underscore as it is used as tokenization pattern. see schema.xml
        self.add_field(docroot, 'skos_prefLabel',
                       "_".join(sorted(set(all_preflabels))))
        depiction = self.preview_builder.get_depiction(entity_id)
        if (depiction):
            self.add_field(docroot, 'foaf_depiction', depiction)

        self.grab_relevance_ratings(docroot, entity_id, entity)

    def shingle_preflabels(self, preflabels):
        shingled_labels = []
        for label in preflabels:
            all_terms = label.split()
            for i in range(len(all_terms)):
                shingle = " ".join(all_terms[i:len(all_terms)])
                shingled_labels.append(shingle)
        return shingled_labels

    def build_payload(self, entity_id, entity_rows, web_resource):
        payload = self.preview_builder.build_preview(
            self.entity_type, entity_id,
            entity_rows[EnrichmentEntity.REPRESENTATION], web_resource)
        return payload

    def add_suggest_filters(self, docroot, enrichment_count):
        self.add_field(docroot, 'suggest_filters',
                       self.entity_type.capitalize())
        if (enrichment_count > 0):
            self.add_field(docroot, 'suggest_filters', 'in_europeana')

    def suggest_by_alt_label(self):
        #this functionality can be activated by individual harvesters
        return False

    def suggest_by_acronym(self):
        #this functionality can be activated by individual harvesters
        return False

    def add_alt_label_to_suggest(self, value, suggester_values):
        if (self.suggest_by_alt_label() and (value not in suggester_values)):
            suggester_values.append(value)

    def add_acronym_to_suggest(self, value, suggester_values):
        if (self.suggest_by_acronym() and (value not in suggester_values)):
            suggester_values.append(value)

    def ignore_hidden_label(self):
        return True
示例#32
0
from pymongo import MongoClient
conn = MongoClient('localhost', 27017)
db = conn.stu
myset = db.class4
# myset.insert({'name':'张铁林','king':'乾隆'})
# myset.insert({'name':'张国立','king':'康熙'},{'name':'陈道明','king':'康熙'})
# myset.insert_many([{'name':'唐国强','king':'雍正'},{'name':'陈建斌','king':'雍正'}])
# cursor = myset.find({},{'_id':0})
# print(cursor)
# for i in cursor:
#     print(i)
# myset1 = db.class0
# cursor = myset1.find({'$or':[{'sex':'w'},{'age':{'$gt':19}}]})
# for i in cursor:
#     print(i)
myset.delete_many({'gender': 'null'})  # remove() was dropped from PyMongo; delete_many is the modern equivalent
conn.close()
示例#33
0
class Mongo(Database):
    EXEC_COUNT_COL = "exec_count"
    PREV_HTML_COL = "prev_html"
    PREV_DIFF_COL = "prev_diff"

    def __init__(self, setting):
        self.setting = setting
        username = urllib.parse.quote_plus(setting['username'])
        password = urllib.parse.quote_plus(setting['password'])
        self.client = MongoClient('mongodb://%s:%s@%s:%s/%s' %
                                  (username, password, setting['hostname'],
                                   setting['port'], setting['database']))
        self.db = self.client[setting['database']]
        if not Mongo.EXEC_COUNT_COL in self.db.list_collection_names():
            self.db.create_collection(Mongo.EXEC_COUNT_COL)
            self.db[Mongo.EXEC_COUNT_COL].insert_one({"count": 1, "id": 1})
        if not Mongo.PREV_HTML_COL in self.db.list_collection_names():
            self.db.create_collection(Mongo.PREV_HTML_COL)
            self.db[Mongo.PREV_HTML_COL].insert_one({"html": "", "id": 1}, )
        if not Mongo.PREV_DIFF_COL in self.db.list_collection_names():
            self.db.create_collection(Mongo.PREV_DIFF_COL)

    def __del__(self):
        self.client.close()

    def insert(self, prev_html, prev_diff):
        self.update_previous_html(prev_html)
        self.db[Mongo.PREV_DIFF_COL].insert_one(prev_diff)

    def drop(self):
        self.db.drop_collection(Mongo.EXEC_COUNT_COL)
        self.db.drop_collection(Mongo.PREV_HTML_COL)
        self.db.drop_collection(Mongo.PREV_DIFF_COL)

    def get_exec_count(self):
        count_data = self.db[Mongo.EXEC_COUNT_COL].find_one()
        if not count_data:
            self.db[Mongo.EXEC_COUNT_COL].insert_one({"count": 1, "id": 1})
            return 1
        return count_data["count"]

    def update_exec_count(self):
        prev_count = int(self.db[Mongo.EXEC_COUNT_COL].find_one()["count"])
        self.db[Mongo.EXEC_COUNT_COL].find_one_and_update(
            {"id": 1}, {'$set': {
                "count": prev_count + 1
            }})

    def get_previous_html(self):
        html_data = self.db[Mongo.PREV_HTML_COL].find_one()
        return html_data["html"]

    def update_previous_html(self, new_html):
        self.db[Mongo.PREV_HTML_COL].find_one_and_update(
            {"id": 1}, {'$set': {
                "html": new_html
            }})

    def find_diff_from_previous(self, target):
        return self.db[Mongo.PREV_DIFF_COL].find_one({"diff": target})

    def _get_previous_diff_max_id(self):
        max_id = 0
        results = self.db[Mongo.PREV_DIFF_COL].find().sort('id',
                                                           DESCENDING).limit(1)
        for c in results:
            if max_id < int(c["id"]):
                max_id = int(c["id"])
        return max_id

    def insert_or_update_diff(self, diff):
        exist_diff = self.find_diff_from_previous(diff)
        if exist_diff:
            self.db[Mongo.PREV_DIFF_COL].find_one_and_update(
                {"diff": exist_diff["diff"]},
                {'$set': {
                    "count": int(exist_diff["count"]) + 1
                }})
            return self.find_diff_from_previous(diff)
        diff_id = self._get_previous_diff_max_id() + 1
        new_record = {"diff": diff, "id": diff_id, "count": 1}
        self.db[Mongo.PREV_DIFF_COL].insert_one(new_record)
        return self.find_diff_from_previous(diff)

    def insert_previous_diff(self, diff):
        diff_id = 1 + self._get_previous_diff_max_id()
        self.db[Mongo.PREV_DIFF_COL].insert_one({
            "id": diff_id,
            "diff": diff,
            "count": 1
        })

    def update_previous_diff(self, target):
        target_diff = self.find_diff_from_previous(target)
        self.db[Mongo.PREV_DIFF_COL].update_one(
            {"diff": target},
            {'$set': {
                "count": int(target_diff["count"]) + 1
            }})
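
# A minimal usage sketch of the Mongo class above; the setting dict and credentials are
# made-up placeholders, and urllib.parse, MongoClient, DESCENDING and the Database base
# class are assumed to be imported/defined in the original module:
setting = {'username': 'app', 'password': 'secret', 'hostname': 'localhost',
           'port': 27017, 'database': 'page_monitor'}
store = Mongo(setting)
store.update_exec_count()
print(store.get_exec_count())
store.insert_or_update_diff('example diff text')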
示例#34
0
class Batch:
    def __init__(self, programme, branch, section, year_of_pass):
        # THIS CONSTRUCTOR IS USED TO CREATE A REQUIRED COLLECTION
        # IN DATABASE. FOR EXAMPLE :- BATCH_BTECH_CSE_A_2021
        #
        self._programme = programme
        self._branch = branch
        self._section = section
        self._year_of_pass = year_of_pass
        try:
            self.client = MongoClient(config.MongoDB_URI)
            db = self.client[config.Batch_DB]
            log(f'[  INFO  ] {config.Batch_DB} Connected Successfully')
        except:
            log(f'[  ERROR ] Unable To Create Connection With {config.Batch_DB}'
                )
        self.collection = db[f'{programme}_{branch}_{section}_{year_of_pass}']

    def insert(self, enrollment):
        # USED TO INSERT ENROLLMENT OF A STUDENT IN THE REQUIRED COLLECTION
        # ---------------------------------------------------------------------------
        # DATA STRUCTURES OF ENROLLED_STUDENTS :-
        # ENROLLMENT --> STRING
        #
        # CHECKING FOR ANY DUPLICATE ENTRY IN THE COLLECTION
        duplicate_entry = self.collection.find_one({'enrollment': enrollment})
        if duplicate_entry != None:
            log(f'[  ERROR ] {enrollment} Enrollment Insertion at {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection in Batch_DB failed - Duplicate Entry Found'
                )
            return 417
        else:
            status = self.collection.insert_one({'enrollment': enrollment})
            log(f'[  INFO  ] {status}')
            log(f'[  INFO  ] {enrollment} Enrollment Inserted Successfully at {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection in {config.Batch_DB}'
                )
            return 201

    def remove(self, enrollment):
        # USED TO REMOVE ENROLLMENT OF A PARTICULAR STUDENT FROM BATCH COLLECTION
        # ----------------------------------------------------------------------------
        # DATA STRUCTURES OF INPUT PARAMETER :-
        # ENROLLMENT --> STRING
        #
        try:
            status = self.collection.delete_one({'enrollment': enrollment})
            log(f'[  INFO  ] {status}')
            log(f'[  INFO  ] {enrollment} Enrollment Removed From {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection in Batch_DB'
                )
            return 220
        except:
            log(f'[  ERROR ] Unable To Remove {enrollment} Enrollment From {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection in Batch_DB'
                )
            return 203

    def remove_all(self):
        # USED TO REMOVE WHOLE COLLECTION FOR WHICH BATCH CLASS
        # OBJECT IS INITIALISED.
        # ----------------------------------------------------------------------------
        #
        try:
            self.collection.drop()
            log(f'[  INFO  ] {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection Removed From Batch_DB'
                )
            return 512
        except:
            log(f'[  ERROR ] Unable To Remove {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection From Batch_DB'
                )
            return 400

    def show_all(self):
        # USED TO DISPLAY A LIST OF ALL THE ENROLLED STUDENTS IN A CLASS
        try:
            res = self.collection.find({})
            if res.count() > 0:
                response = {'status': 302, 'res': res}
            else:
                response = {'status': 302, 'res': {}}
            log(f'[  INFO  ] {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection Fetched From Batch_DB'
                )
        except:
            response = {'status': 598, 'res': None}
            log(f'[  ERROR ] Unable To Fetch {self._programme}_{self._branch}_{self._section}_{self._year_of_pass} Collection From Batch_DB'
                )
        return response

    def __del__(self):
        # log('[  INFO  ] Connection closed successfully of Batch_DB.')
        self.client.close()  # RELEASING OPEN CONNECTION WITH DATABASE
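
# A minimal usage sketch of the Batch class above; config.MongoDB_URI, config.Batch_DB
# and the log helper are assumed to exist as in the original module, and the values
# below are made-up placeholders:
batch = Batch('btech', 'cse', 'a', '2021')
status = batch.insert('03720802717')   # returns 201 on success, 417 on a duplicate entry
print(batch.show_all()['status'])      # 302 when the fetch succeeds
batch.remove('03720802717')            # returns 220 on success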
示例#35
0
from pymongo import MongoClient

client = MongoClient(host='localhost', port=27017)
db = client['text']  # database name

db['inventory'].delete_one({})
# Delete operations do not drop indexes, even if deleting all documents from a collection
db['inventory'].delete_many({})
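# Dropping the collection removes its indexes as well, e.g. db['inventory'].drop()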

# db['inventory'].remove()  # legacy call, removed in PyMongo 4 -- delete_many({}) above is the replacement
client.close()
示例#36
0
class Marksheet:
    def __init__(self, faculty_id, subject, programme, branch, section,
                 year_of_pass, semester):
        # Constructor of marksheet accepts the following parameters:
        # faculty_id --> Unique Id of faculty --> string
        # subject --> subject name taught by the given faculty --> string
        # programme --> programme of the class whose marks are provided here --> string
        # branch --> like cse, IT etc --> string
        # section --> string
        # year_of_pass --> string
        # semester --> string
        #
        # Creating a collection in database with identifier like
        # "037_maths_btech_cse_a_2021_4".
        #
        try:
            self.client = MongoClient(config.MongoDB_URI)
            db = self.client[config.Marksheet_DB]
            log('[ INFO  ] Marksheet_DB Connected Successfully')
        except:
            log('[ Error ] Unable To Create Connection With Marksheet_DB')
            sys.exit(0)
        self.collection = db[
            f'{faculty_id}_{subject}_{programme}_{branch}_{section}_{year_of_pass}_{semester}']

    def insert(self, marksheet_dictionary):
        # inserts a marksheet dictionary that contains enrollment, marks and assessment
        # if it isn't already present in the db.
        # -----------------------------------------------------------------------------------
        # for example :
        # marksheet_dictionary = {
        #							'enrollment':'03720802717',
        #							'marks' :  '29',
        #							'assessment':'8'
        #						}
        #
        duplicate_entry = self.collection.find_one(
            {'enrollment': marksheet_dictionary['enrollment']})
        if duplicate_entry != None:
            log('[ Error ] Object of this Enrollment Number already present in Database'
                )
            return 417
        else:
            status = self.collection.insert_one(marksheet_dictionary)
            log(f'[ INFO  ] {status}')  # Printing Status of result of query
            log('[ INFO  ] Marks of the enrollment number inserted in Marksheet_DB.'
                )
            return 201

    def show_of(self, enrollment):
        # This method inputs enrollment and returns marks of that particular enrollment.
        # -------------------------------------------------------------------------------
        # Data Structures of input parameter :-
        # enrollment --> string
        #
        try:
            res = list(self.collection.find({'enrollment': enrollment}))
            log('[ INFO  ] Marks of the enrollment has been displayed.')
            response = {'status': '202', 'res': res}
        except:
            response = {'status': '404', 'res': 'NA'}
        return response

    def show_all(self):
        # This method doesn't take any input and returns marks of all students.
        # -------------------------------------------------------------------------------
        #
        try:
            res = list(self.collection.find({}))
            log('[ INFO  ] Marks of all the students has been successfully displayed.'
                )
            response = {'status': '302', 'res': res}
        except:
            response = {'status': '598', 'res': 'NA'}
        return response

    def remove(self, enrollment):
        # This method removes the collection of marks of a particular
        # enrollment from the class.
        # ----------------------------------------------------------------------------------
        # Data Structures of input parameter :-
        # enrollment --> string
        #
        try:
            status = self.collection.delete_many({'enrollment': enrollment})
            log(f'[ INFO  ] {status}')  # Printing Status of result of query
            log('[ INFO  ] Marks of particular enrollment has been removed.')
            return 220
        except:
            return 203

    def update(self, enrollment, marksheet_dictionary):
        # This method is used to update marks of a particular enrollment
        # with marksheet_dictionary object
        # --------------------------------------------------------------------------------
        # Data Structures of the input parameters :-
        # enrollment --> string
        # marksheet_dictionary --> dictionary
        #
        searching_values = {'enrollment': enrollment}
        updation_value = marksheet_dictionary
        try:
            status = self.collection.update_many(searching_values,
                                                 {'$set': updation_value})
            log(f'[ INFO  ] {status}')  # Printing Status of result of query
            log('[ INFO  ] Marksheet_DB has been updated.')
            return 301
        except:
            return 204

    def __del__(self):
        self.client.close()
                min_profit_ratio_arr[cur_index] = profit_ratio
            # accumulate the profit value
            profit_ratio_arr[cur_index] += profit_ratio

    # plotting
    # compute the average profit
    for index in range(len(profit_ratio_arr)):
        profit_ratio_arr[index] = round(profit_ratio_arr[index] / (buck_num+1), 2)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title(u'value ratio_for_%s, 单个总数:%s' % (buy_result, limit_num), fontproperties=font_set)
    plt.plot(index_arr, profit_ratio_arr)
    # x-axis label
    plt.xlabel(u'主客身价比', fontproperties=font_set)
    # y-axis label
    plt.ylabel(u'利润', fontproperties=font_set)
    for a, b in zip(index_arr, profit_ratio_arr):
        plt.text(a, b, b, ha='center', va='bottom', fontsize=10)
    # plt.xticks(index_arr)
    plt.legend()
    # print the minimum profit values
    print('最低利润为:\n')
    for i in range(0, len(min_profit_ratio_arr)):
        print('%.2f, ' % min_profit_ratio_arr[i])
    plt.show()

except Exception as err:
    print('%s\n%s' % (err, traceback.format_exc()))
finally:
    mongo_client.close()
示例#38
0
class YunqicrawlPipeline(object):

    # def __init__(self, mongo_uri, mongo_db, replicaset):
    #     self.mongo_uri = mongo_uri
    #     self.mongo_db = mongo_db
    #     self.replicaset = replicaset
    #
    # @classmethod
    # def from_crawler(cls, crawler):
    #     return cls(mongo_uri=crawler)
    def open_spider(self, spider):
        self.client = MongoClient()
        self.collection = self.client["yunqi"]["book"]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        if isinstance(item, YunqiBookListItem):
            # pass
            # self._precess_booklist_item(item)
            # else:
            # print("==================")
            # print(item)
            self._precess_bookeDetail_item(item)
        return item

    def _precess_booklist_item(self, item):
        """
        Process novel info
        :param item:
        :return:
        """
        self.collection.insert(dict(item))

    def _precess_bookeDetail_item(self, item):
        """
        Process novel popularity info
        :param item:
        :return:
        """
        pattern = re.compile(r"\d+")  # pull the leading digits out of each count field

        item["novelLabel"] = item["novelLabel"].strip().replace("\n", "")

        match = pattern.search(item["novelAllClick"])
        item["novelAllClick"] = match.group(
        ) if match else item["novelAllClick"]

        match = pattern.search(item["novelMonthClick"])
        item["novelMonthClick"] = match.group(
        ) if match else item["novelMonthClick"]

        match = pattern.search(item["novelWeekClick"])
        item["novelWeekClick"] = match.group(
        ) if match else item["novelWeekClick"]

        match = pattern.search(item["novelAllPopular"])
        item["novelAllPopular"] = match.group(
        ) if match else item["novelAllPopular"]

        match = pattern.search(item["novelMonthPopular"])
        item["novelMonthPopular"] = match.group(
        ) if match else item["novelMonthPopular"]

        match = pattern.search(item["novelWeekPopular"])
        item["novelWeekPopular"] = match.group(
        ) if match else item["novelWeekPopular"]

        match = pattern.search(item["novelAllComm"])
        item["novelAllComm"] = match.group() if match else item["novelAllComm"]

        match = pattern.search(item["novelMonthComm"])
        item["novelMonthComm"] = match.group(
        ) if match else item["novelMonthComm"]

        match = pattern.search(item["novelWeekComm"])
        item["novelWeekComm"] = match.group(
        ) if match else item["novelWeekComm"]

        self.collection.insert(dict(item))
示例#39
0
# _*_ coding:utf-8 _*_
# !/usr/bin/python

from pymongo import MongoClient

mc = MongoClient('localhost', 27017)  # connect to the database
db = mc.mydb  # use the mydb database
db.user.insert_one({'name': '张三', 'age': 90})  # write the record to the collection (save() was removed in PyMongo 4)
# query records
print([d for d in db.user.find({'name': 'chengxudong'})])
data_obj = db.user.find()
for o in data_obj:
    print(o)
print([d for d in data_obj])  # prints [] because the cursor was already exhausted by the loop above
mc.close()  # close the database connection
from pymongo import MongoClient

# By default the host is localhost and the port is 27017
# These values can be changed by passing a URI
# client = MongoClient("mongodb://localhost:27017/")
client = MongoClient()
db = client.catalogue
documents = [
    {
        "title": "A Light in the Attic",
    },
    {
        "title": "Tipping the Velvet",
    },
    {
        "title": "Soumission",
    },
]
db.books.insert_many(documents)
client.close()  # close the database connection
    b = ax.bar(x, list_People, color='k', alpha=0.8, tick_label=list_name)
    #print(type(b))
    #datas = pd.Series(list_People, index=list_name)
    #c = datas.plot.bar(color='k', alpha=0.8) # vertical bar chart
    #print(type(c))
    for i in b:
        h = i.get_height()
        ax.text(i.get_x() + i.get_width() / 2,
                h,
                '%.4f' % h,
                ha='center',
                va='bottom')
    plt.xticks(rotation=15)
    plt.xlabel("店铺名称")
    plt.ylabel("购买人数/10000")
    plt.title("销售分析")
    plt.ylim(0, 10)
    plt.show()


if __name__ == "__main__":
    global db
    global sntable
    global table
    table = 'TaoBaoLipstick'
    mconn = MongoClient("mongodb://localhost")
    db = mconn['test']
    db.authenticate('test', 'test')
    pandas_data()
    mconn.close()
示例#42
0
def test_register(request):
    if request.method == "POST":
        email = request.POST.get('email')
        email2 = request.POST.get('email2')
        username1 = request.POST.get('username1')
        username2 = request.POST.get('username2')
        pwd = request.POST.get('pwd')
        pwd2 = request.POST.get('pwd2')
        sex = request.POST.get('sex')
        organization = request.POST.get('organization')
        research = request.POST.get('research')
        title = request.POST.get('title')
        age = request.POST.get('age')
        qq = request.POST.get('qq')
        wechat = request.POST.get('wechat')
        blog = request.POST.get('blog')
        if email != email2:
            return HttpResponse('两次注册邮箱不一致')
        elif len(str(email)) == 0:
            return HttpResponse('提交失败')
        elif pwd != pwd2:
            return HttpResponse('两次密码不一致')

        else:
            __db_server, __db_port = '127.0.0.1', 27017
            client = MongoClient(__db_server, __db_port)
            db = client['userinfo']
            try:
                user = db.userinfo.find_one({'_id': email})
                if user is None:
                    db.trial1.insert_one({
                        '_id': email,
                        '姓': username1,
                        '名': username2,
                        '密码': pwd,
                        '密码2': pwd2,
                        '性别': sex,
                        '工作/学习单位': organization,
                        '专业/研究方向': research,
                        '职称': title,
                        '年龄': age,
                        'qq': qq,
                        'wechat': wechat,
                        '个人主页': blog,
                    })
                    send_register_email(email, 'register')
                    client.close()
                    return HttpResponse('邮箱验证已发送')
                else:
                    db.trial1.update_one({"_id": email}, {
                        '$set': {
                            '姓': username1,
                            '名': username2,
                            '密码': pwd,
                            '密码2': pwd2,
                            '性别': sex,
                            '工作/学习单位': organization,
                            '专业/研究方向': research,
                            '职称': title,
                            '年龄': age,
                            'qq': qq,
                            'wechat': wechat,
                            '个人主页': blog,
                        }
                    },
                                         upsert=None)
                    send_register_email(email, 'register')
                    client.close()
                    return HttpResponse('邮箱验证已发送')

            except KeyError:
                message = "该邮箱已注册"
                client.close()
                return render(request, 'login.html', {'msg': message})
    else:
        return render(request, 'register.html')
示例#43
0
class ZhihuPipeline(object):
    """
    Store data
    """
    def __init__(self, mongo_uri, mongo_db, image_dir):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.image_dir = image_dir
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(mongo_uri=MONGO_URI,
                   mongo_db='zhihu',
                   image_dir=os.path.join(PROJECT_DIR, 'images'))

    def open_spider(self, spider):
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        if not os.path.exists(self.image_dir):
            os.mkdir(self.image_dir)

    def close_spider(self, spider):
        self.client.close()

    def _process_people(self, item):
        """
        Store user information
        """
        collection = self.db['people']
        zhihu_id = item['zhihu_id']
        collection.update({'zhihu_id': zhihu_id}, dict(item), upsert=True)

        image_url = item['image_url']
        if image_url and zhihu_id:
            image_path = os.path.join(self.image_dir,
                                      '{}.jpg'.format(zhihu_id))
            download_pic.delay(image_url, image_path)

    def _process_relation(self, item):
        """
        Store the user relation topology
        """
        collection = self.db['relation']

        data = collection.find_one({
            'zhihu_id': item['zhihu_id'],
            'user_type': item['user_type']
        })
        if not data:
            self.db['relation'].insert(dict(item))
        else:
            origin_list = data['user_list']
            new_list = item['user_list']
            data['user_list'] = list(set(origin_list) | set(new_list))
            collection.update(
                {
                    'zhihu_id': item['zhihu_id'],
                    'user_type': item['user_type']
                }, data)

    def process_item(self, item, spider):
        """
        Process the item
        """
        if isinstance(item, ZhihuPeopleItem):
            self._process_people(item)
        elif isinstance(item, ZhihuRelationItem):
            self._process_relation(item)
        return item
示例#44
0
class Dao:
    """ Data access class."""
    def __init__(self, host: str, database: str):
        """ Create new DAO atop of MongoClient"""
        self.client = MongoClient(host)
        self.database = database
        self.POSTS = "posts"
        self.USERS = "users"

    def get_default_query(self):
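        # Every insert in this class tags documents with partition_id=1 and every
        # query/delete filters on it, so the DAO only touches its own partition of
        # the posts/users collections.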
        return {"partition_id": 1}

    def insert_posts(self, posts: [dict]):
        for post in posts:
            post.setdefault('partition_id', 1)
        self.client                                 \
            .get_database(name=self.database)       \
            .get_collection(name=self.POSTS)        \
            .insert_many(posts)

    def insert_users(self, users: [dict]):
        for user in users:
            user.setdefault('partition_id', 1)
        self.client                                 \
            .get_database(name=self.database)       \
            .get_collection(name=self.USERS)        \
            .insert_many(users)
        # for user in users:
        #     print(user)
        #     self.client                                 \
        #         .get_database(name=self.database)       \
        #         .get_collection(name=self.USERS)        \
        #         .insert_one(user)

    def select_user_by(self, username=None, userid=None):
        query = self.get_default_query()

        if username is not None:
            query['username'] = username

        if userid is not None:
            query['id'] = userid

        result = self.client                        \
            .get_database(name=self.database)       \
            .get_collection(name=self.USERS)        \
            .find_one(query)

        return result

    def delete_all_posts(self):
        self.client                                 \
            .get_database(name=self.database)       \
            .get_collection(self.POSTS)             \
            .delete_many(self.get_default_query())

    def delete_all_users(self):
        self.client                                 \
            .get_database(name=self.database)       \
            .get_collection(self.USERS)             \
            .delete_many(self.get_default_query())

    def close(self):
        self.client.close()
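
# A minimal usage sketch of the Dao class above; the host and database names are
# made-up placeholders and MongoClient is assumed to be imported in the original module:
dao = Dao('mongodb://localhost:27017', 'social')
dao.insert_users([{'id': 1, 'username': 'alice'}])
print(dao.select_user_by(username='alice'))
dao.delete_all_users()
dao.close()
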
def before_scenario(context, scenario):
    client = MongoClient(context.mongo_url)
    mongo_db = client.get_database('database')
    mongo_db['collection1'].delete_many({})
    client.close()
示例#46
0
class MongoDB(object):
    """main script class"""

    # pylint: disable=too-many-instance-attributes
    def __init__(self):
        self.mongo_host = "127.0.0.1"
        self.mongo_port = 27017
        self.mongo_db = [
            "admin",
        ]
        self.mongo_user = None
        self.mongo_password = None
        self.__conn = None
        self.__dbnames = None
        self.__metrics = []

    def connect(self):
        """Connect to MongoDB"""
        if self.__conn is None:
            if self.mongo_user is None:
                try:
                    self.__conn = MongoClient(
                        'mongodb://%s:%s' % (self.mongo_host, self.mongo_port))
                except errors.PyMongoError as py_mongo_error:
                    print('Error in MongoDB connection: %s' %
                          str(py_mongo_error))
            else:
                try:
                    self.__conn = MongoClient(
                        'mongodb://%s:%s@%s:%s' %
                        (self.mongo_user, self.mongo_password, self.mongo_host,
                         self.mongo_port))
                except errors.PyMongoError as py_mongo_error:
                    print('Error in MongoDB connection: %s' %
                          str(py_mongo_error))

    def add_metrics(self, k, v):
        """add each metric to the metrics list"""
        dict_metrics = {}
        dict_metrics['key'] = k
        dict_metrics['value'] = v
        self.__metrics.append(dict_metrics)

    def print_metrics(self):
        """print out all metrics"""
        metrics = self.__metrics
        for metric in metrics:
            zabbix_item_key = str(metric['key'])
            zabbix_item_value = str(metric['value'])
            print(zabbix_item_key + ' ' + zabbix_item_value)

    def get_db_names(self):
        """get a list of DB names"""
        if self.__conn is None:
            self.connect()
        db_handler = self.__conn[self.mongo_db[0]]

        master = db_handler.command('isMaster')['ismaster']
        dict_metrics = {}
        dict_metrics['key'] = 'mongodb.ismaster'
        if master:
            dict_metrics['value'] = 1
            db_names = self.__conn.list_database_names()
            self.__dbnames = db_names
        else:
            dict_metrics['value'] = 0
        self.__metrics.append(dict_metrics)

    def get_mongo_db_lld(self):
        """print DB list in json format, to be used for
        mongo db discovery in zabbix"""
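        # Example of the value this produces (assuming databases "admin" and "local"):
        #   {"data": [{"{#MONGODBNAME}": "admin"}, {"{#MONGODBNAME}": "local"}]}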
        if self.__dbnames is None:
            db_names = self.get_db_names()
        else:
            db_names = self.__dbnames
        dict_metrics = {}
        db_list = []
        dict_metrics['key'] = 'mongodb.discovery'
        dict_metrics['value'] = {"data": db_list}
        if db_names is not None:
            for db_name in db_names:
                dict_lld_metric = {}
                dict_lld_metric['{#MONGODBNAME}'] = db_name
                db_list.append(dict_lld_metric)
            dict_metrics['value'] = '{"data": ' + json.dumps(db_list) + '}'
        self.__metrics.insert(0, dict_metrics)

    def get_server_status_metrics(self):
        """get server status"""
        if self.__conn is None:
            self.connect()
        db_handler = self.__conn[self.mongo_db[0]]
        ss = db_handler.command('serverStatus')

        # db info
        self.add_metrics('mongodb.version', ss['version'])
        self.add_metrics('mongodb.storageEngine', ss['storageEngine']['name'])
        self.add_metrics('mongodb.uptime', int(ss['uptime']))
        self.add_metrics('mongodb.okstatus', int(ss['ok']))

        # asserts
        for k, v in ss['asserts'].items():
            self.add_metrics('mongodb.asserts.' + k, v)

        # operations
        for k, v in ss['opcounters'].items():
            self.add_metrics('mongodb.operation.' + k, v)

        # memory
        for k in ['resident', 'virtual', 'mapped', 'mappedWithJournal']:
            self.add_metrics('mongodb.memory.' + k, ss['mem'][k])

        # connections
        for k, v in ss['connections'].items():
            self.add_metrics('mongodb.connection.' + k, v)

        # network
        for k, v in ss['network'].items():
            self.add_metrics('mongodb.network.' + k, v)

        # extra info
        self.add_metrics('mongodb.page.faults',
                         ss['extra_info']['page_faults'])

        #wired tiger
        if ss['storageEngine']['name'] == 'wiredTiger':
            self.add_metrics(
                'mongodb.used-cache',
                ss['wiredTiger']['cache']["bytes currently in the cache"])
            self.add_metrics(
                'mongodb.total-cache',
                ss['wiredTiger']['cache']["maximum bytes configured"])
            self.add_metrics(
                'mongodb.dirty-cache',
                ss['wiredTiger']['cache']["tracked dirty bytes in the cache"])

        # global lock
        lock_total_time = ss['globalLock']['totalTime']
        self.add_metrics('mongodb.globalLock.totalTime', lock_total_time)
        for k, v in ss['globalLock']['currentQueue'].items():
            self.add_metrics('mongodb.globalLock.currentQueue.' + k, v)
        for k, v in ss['globalLock']['activeClients'].items():
            self.add_metrics('mongodb.globalLock.activeClients.' + k, v)

    def get_db_stats_metrics(self):
        """get DB stats for each DB"""
        if self.__conn is None:
            self.connect()
        if self.__dbnames is None:
            self.get_db_names()
        if self.__dbnames is not None:
            for mongo_db in self.__dbnames:
                db_handler = self.__conn[mongo_db]
                dbs = db_handler.command('dbstats')
                for k, v in dbs.items():
                    if k in [
                            'storageSize', 'ok', 'avgObjSize', 'indexes',
                            'objects', 'collections', 'fileSize', 'numExtents',
                            'dataSize', 'indexSize', 'nsSizeMB'
                    ]:
                        self.add_metrics(
                            'mongodb.stats.' + k + '[' + mongo_db + ']',
                            int(v))

    def close(self):
        """close connection to mongo"""
        if self.__conn is not None:
            self.__conn.close()
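
# A minimal usage sketch of how a monitoring script would typically drive the class
# above (not part of the original example; a reachable mongod on 127.0.0.1:27017 and a
# server that exposes the serverStatus fields referenced above are assumed):
mongodb = MongoDB()
mongodb.connect()
mongodb.get_db_names()
mongodb.get_mongo_db_lld()
mongodb.get_server_status_metrics()
mongodb.get_db_stats_metrics()
mongodb.print_metrics()
mongodb.close()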
示例#47
0
class OTP:
    def __init__(self):
        try:
            self.client = MongoClient(config.MongoDB_URI)
            db = self.client[config.OTP_DB]
            log(f'[  INFO  ] {config.OTP_DB} Connected Successfully')
        except:
            log(f'[  ERROR ] Unable To Create Connection With {config.OTP_DB}')
        self.collection = db[config.OTP_COLLECTION]

    def insert(self, hash_id, otp, function):
        # USED TO INSERT OTP FOR A PARTICULAR USERID
        # ---------------------------------------------------------------------------
        # DATA STRUCTURES OF ENROLLED_STUDENTS :-
        # HASH_ID --> STRING
        # OTP --> INTEGER
        # FUNCTION --> STRING
        #
        # CHECKING THE PRESENCE OF DUPLICATE ENTRY IN DATABASE
        try:
            res = self.collection.find({'hash_id': hash_id})
            if res.count() > 0:
                ## CHECKING WHETHER SAME FUNCTIONALITY EXISTS IN THE DUPLICATE RESULTS
                for document in res:
                    if document['function'] == function:
                        log(f'[  INFO  ] For Hash ID - {hash_id} Duplicate Entry Found at {config.OTP_COLLECTION} Collection in {config.OTP_DB}'
                            )
                        status = self.collection.delete_one(
                            {'hash_id': hash_id})
                        log(f'[  INFO  ] {status}')
                        log(f'[  INFO  ] Hash_ID - {hash_id} Removed Successfully from {config.OTP_COLLECTION} Collection in {config.OTP_DB}'
                            )
            status = self.collection.insert_one({
                'hash_id': hash_id,
                'otp': otp,
                'function': function
            })
            log(f'[  INFO  ] {status}')
            log(f'[  INFO  ] For Hash_ID - {hash_id} OTP Inserted Successfully at {config.OTP_COLLECTION} Collection in {config.OTP_DB}'
                )
            return 201
        except Exception as e:
            log(f'[  ERROR ] Unable To Insert Document For Hash_ID - {hash_id} at {config.OTP_COLLECTION} Collection in {config.OTP_DB}'
                )
            return 417

    def query(self, query_param, query_value):
        # THIS QUERY FUNCTION INPUTS QUERY PARAMETER LIKE USERID AND QUERY VALUE TO SEARCH
        # IN COLLECTION.  AFTER SUCCESSFUL SEARCH, IT RETURNS RESULT COMBINED WITH STATUS VALUE
        # -----------------------------------------------------
        # DATA STRUCTURE OF INPUT PARAMETER :-
        # QUERY_PARAMETER --> STRING
        # QUERY_VALUE --> STRING
        #
        res = self.collection.find({query_param: query_value})
        if res.count() > 0:  ## RUNS WHEN ANY RESULT COMES
            response = {'status': 212, 'res': res}
        else:
            response = {'status': 206, 'res': None}
        log(f'[  INFO  ] The Search Query Completed Successfully in {config.OTP_DB}'
            )
        return response

    def remove(self, hash_id, function):
        # USED TO REMOVE DOCUMENT CARRYING USERID AND OTP GENERATED FOR THAT USERID
        # ----------------------------------------------------------------------------
        # DATA STRUCTURES OF INPUT PARAMETER :-
        # HASH_ID --> STRING
        # FUNCTION --> STRING
        try:
            status = self.collection.delete_one({
                'hash_id': hash_id,
                'function': function
            })
            log(f'[  INFO  ] {status}')
            log(f'[  INFO  ] Hash_ID - {hash_id} Removed Successfully from {config.OTP_COLLECTION} Collection in {config.OTP_DB}'
                )
            return 220
        except:
            log(f'[  ERROR ] Unable To Remove Hash_ID - {hash_id} from {config.OTP_COLLECTION} Collection in {config.OTP_DB}'
                )
            return 203

    def __del__(self):
        self.client.close()  # RELEASING OPEN CONNECTION WITH DATABASE
示例#48
0
# This file reads a JSON file into a list
# and inserts it into a collection in a db.

import json
from pymongo import MongoClient

# connect to the MongoDB
connection = MongoClient("mongodb://localhost:27017/")

# connect to the UnilPlan database and the Classes collection
db = connection.db_unilplan.classes

# open the .json file and load it into a list
with open('crawler/crawler/JSON_output_files/Courses.json',
          encoding='utf-8') as json_data:
    classes = json.load(json_data)
    print(".json = ok")

# insert the data in the db (insert() was removed from PyMongo; insert_many assumes
# Courses.json holds a list of documents)
db.insert_many(classes)
print("correctly added")

# close the connection to MongoDB
connection.close()
        replhosts[r].append(hp)

if len(dbhosts) > 0:
    dbhosts = dbhosts.rstrip(',')

ctx.instance.runtime_properties['dbhosts'] = dbhosts

ctx.logger.info("Set dbhosts to ({})".format(dbhosts))

##################################################################
# Initialize replica sets
##################################################################

# for each replicaset

ctx.logger.info("replosts size:{}".format(len(replhosts)))
for k, v in replhosts.iteritems():
    ctx.logger.info("replhost:{}".format(key))
    if (len(v) > 0):
        config = {'_id': k, 'members': []}
        for i, h in enumerate(v):
            config['members'].append({'_id': i, 'host': h})
        h, p = v[0].split(":")
        c = MongoClient(h, int(p))
        ctx.logger.info("initiating replicaset:{}".format(str(config)))
        try:
            c.admin.command("replSetInitiate", config)
        except:
            pass
        c.close()
示例#50
0
class Scraper(object):
    def __init__(self,tor, zip_code):
        """
        Sets the data 
        """
        self._tor = tor
        # test tor
        self._zip_code = zip_code
        #Zillow database for housing description
        self._client = MongoClient('localhost', 27017)
        db = self._client.Zillow
        self._housing_description = db.housing_description
        
        #run the scraper
        self._get_zip_data()

    def _get_property_summary(self,soup):
        """
        This function was changed slightly from the original scraper,
        since otherwise it hangs on some pages.
        Given a soup it populates the results dict and returns it
        """
        def parse_property(regex, property_):
            try:
                results[property_] = re.findall(regex, prop_summary)[0]
            except IndexError:
                results[property_] = None
        def parse_property2(string, property_):
            try:
                results[property_] = prop_summary.split(string)[1].split('"')[1]
            except IndexError:
                results[property_] = None

        prop_summary = soup.find("div", class_=constants.PROP_SUMMARY_CLASS)
        prop_summary = prop_summary.text
        results = {}
        parse_property(r"([\d\.]+) beds?", "bedrooms")
        parse_property(r"([\d\.]+) baths?", "bathrooms")
        parse_property(r"([\d,\.]+) sqft", "sqft")
        #these two lines don't always work; the regex approach seems to hang
        #parse_property(r"((?:[A-Z]\w+ ?){1,}), [A-Z]{2}", "city")
        #parse_property(r"(?:[A-Z]\w+ ?){1,}, ([A-Z]{2})", "state")
        parse_property2('"city":', 'city')
        parse_property2('"state":', 'state')
        parse_property(r"[A-Z]{2} (\d{5}-?(?:\d{4})?)", "zipcode")
        return results

    def _get_price_tax_url(self, soup):
        """
        Given the soup of the housing details html this will find and 
        return the ajaxURL for both price history and tax history 
        """
        groups = soup.text.split('ajaxURL')
        price_history, tax_history = None, None
        for group in groups[1:-1]:
            group = group.split(";")[0]
            if 'divId:"hdp-price-history"' in group:
                price_history = "http://www.zillow.com" + group.split('"')[1:2][0]
            elif 'divId:"hdp-tax-history"' in group:
                tax_history = "http://www.zillow.com" + group.split('"')[1:2][0]
        return price_history, tax_history

    def _populate_price_and_tax_histories(self,soup, results):
        """
        Changed slightly from scrapezillow.scraper.
        Given a BeautifulSoup soup it will use tor to request the data and
        populate the price and tax history
        """

        #get price and tax urls
        price_url, tax_url = self._get_price_tax_url(soup)

        ##populate price and tax history

        html = self._tor.request(price_url)
        soup = BeautifulSoup(html)
        results["price_history"] = self._get_price_history(soup)

        html = self._tor.request(tax_url)
        soup = BeautifulSoup(html)
        results["tax_history"] = self._get_tax_history(soup)

    def _get_price_history(self,soup):
        """
        Changed slightly from scrapezillow.scraper.
        Given a BeautifulSoup soup it will populate the price history
        """
        data =[]
        try:
            table_body = soup.find('table')
            rows = table_body.find_all('tr')
            for row in rows:
                try:
                    cols = row.find_all('td')
                    cols = [ele for ele in cols]
                    date = cols[0].get_text()
                    event = cols[1].get_text()
                    price_span = cols[2].find('span')
                    if not price_span:
                        price = None
                    else:
                        price = price_span.get_text()

                    data.append([date, event, price])
                except:
                    pass # undesired data
        except:
            pass #no table found
        return data

    def _get_tax_history(self,soup):
        """
        Changed slightly from scrapezillow.scraper.
        Given a BeautifulSoup soup it will populate the tax history
        """
        data = []
        try:
            table_body = soup.find('table')
            rows = table_body.find_all('tr')
            for row in rows:
                try:
                    cols = row.find_all('td')
                    cols = [ele for ele in cols]
                    date = cols[0].get_text()
                    tax = cols[1].contents[0]
                    assessment = cols[3].get_text()

                    data.append([date, tax, assessment])
                except:
                    pass # undesired data
        except:
            pass ##No table found
        return data

    def _scrape(self,html,url):
        """
        Scrape a specific Zillow home, given the raw html of its page and its url.
        """
        soup = BeautifulSoup(html, 'html.parser')
        results = self._get_property_summary(soup)
        results['url'] = url
        facts = scraper._parse_facts(scraper._get_fact_list(soup))
        results.update(**facts)
        results.update(**scraper._get_sale_info(soup))
        results["description"] = scraper._get_description(soup)
        results["photos"] = scraper._get_photos(soup)
        self._populate_price_and_tax_histories(soup, results)
        return results
        
        
    def _has_next(self,soup):
        """
        Looks for the Next button on the webpage to see if there are more houses
        """
        if soup == None:
            return True
        return len(soup.findAll("li", { "class" : "zsg-pagination-next" }))==1
    
    def _get_house_links(self,soup):
        """
        Adds house details into the mongo database
        """
        for address in soup.findAll("dt", { "class" : "property-address" }):
            url = 'http://www.zillow.com'+address.find('a')['href']
            # Look up if already in the database
            if self._housing_description.find_one({'url':url})==None:
                try:
                    print url
                    html = self._tor.request(url)
                    self._housing_description.insert(self._scrape(html,url))
                    #sleep(1)
                except:
                    ## scrape failed
                    ## missing data so just add url to not try to add again
                    self._housing_description.insert({'url':url})
    
    def _get_zip_data(self):
        """
        Finds the housing data for the zip
        """
        soup = None
        page = 1
        print 'Zip code:',self._zip_code,' started.'
        while(self._has_next(soup)):
            url = 'http://www.zillow.com/homes/for_rent/'+str(self._zip_code)+'_rb/'+str(page)+'_p'
            r = self._tor.request(url)
            print 'Url received: ', url
            soup = BeautifulSoup(r)
            self._get_house_links(soup)
            page += 1
            sleep(1)
        print 'Zip code:',self._zip_code,' finished.'
        self._client.close()
示例#51
0
class GenerateMongo(object):
    mongo_host = "172.31.10.53"
    mongo_port = 27017
    mongo = None
    mongodb = None

    def __init__(self):
        # self.logger = logger
        self.mongo = MongoClient(host=self.mongo_host, port=self.mongo_port)
        self.mongodb = self.mongo["rap"]

    def get_req_param(self, action_id=155):
        req = self.mongodb["tb_request_parameter_list_mapping"].find(
            {"action_id": action_id}, {
                "parameter_id": 1,
                "_id": 0
            })
        req_arr = list(map(lambda x: x["parameter_id"], req))
        param = self.mongodb["tb_parameter"].find({"id": {"$in": req_arr}})
        return list(
            map(
                lambda x: {
                    "id": x["id"],
                    "name": x["name"],
                    "identifier": x["identifier"],
                    "data_type": x["data_type"]
                }, param))

    def get_res_param(self, action_id=155):
        res = self.mongodb["tb_response_parameter_list_mapping"].find(
            {"action_id": action_id}, {
                "parameter_id": 1,
                "_id": 0
            })
        # print(res)
        res_arr = list(map(lambda x: x["parameter_id"], res))
        # print(res_arr)
        param = self.mongodb["tb_parameter"].find({"id": {"$in": res_arr}})
        return list(
            map(
                lambda x: {
                    "id": x["id"],
                    "name": x["name"],
                    "identifier": x["identifier"],
                    "data_type": x["data_type"]
                }, param))

    def get_complex_param(self, complex_parameter_id=7981):
        complex = self.mongodb["tb_complex_parameter_list_mapping"].find(
            {"complex_parameter_id": complex_parameter_id}, {
                "parameter_id": 1,
                "_id": 0
            })
        complex_arr = list(map(lambda x: x["parameter_id"], complex))
        param = self.mongodb["tb_parameter"].find({"id": {"$in": complex_arr}})
        return list(
            map(
                lambda x: {
                    "id": x["id"],
                    "name": x["name"],
                    "identifier": x["identifier"],
                    "data_type": x["data_type"]
                }, param))

    def recursion_param(self, param):
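        # Recursively builds an example payload from the parameter metadata: scalar
        # parameters map identifier -> name, "array<...>" types map to a one-element
        # list, and object / array<object> types recurse into their child parameters.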
        map = {}
        for p in param:
            data_type = p["data_type"]
            identifier = p["identifier"]
            if (not data_type == "object"
                    and not data_type == "array<object>"):
                if ("array<" in data_type):
                    map[identifier] = [str(p["name"])]
                else:
                    map[identifier] = p["name"]
            elif (data_type == "object"):
                sub_param = self.get_complex_param(p["id"])
                sub_map = self.recursion_param(sub_param)
                map[identifier] = sub_map
            elif (data_type == "array<object>"):
                sub_param = self.get_complex_param(p["id"])
                sub_map = self.recursion_param(sub_param)
                map[identifier] = [sub_map]
        return map

    def insert_mongo(self, doc):
        actionId = doc["actionId"]
        self.mongodb["my_rap"].remove({"actionId": actionId})
        self.mongodb["my_rap"].insert(doc)
        # self.mongodb["my_rap"].update({"actionId":actionId}, {"$set": doc}, upsert=True)

    def __del__(self):
        self.mongo.close()
示例#52
0
    def transform(self, X, **transform_params):

        # connect to db
        mongoClient = MongoClient('localhost', 27017)
        ffCorpus = mongoClient.FACTFEELCorpus
        # Sentence's table
        documentCollection = ffCorpus.documents
        temp = []

        if self.featureSetConfiguration == 0:  # not active
            temp = [[
                0
                for f in sorted(SubjectivityLexiconTransformer.features.keys())
            ] for s in X]
        elif self.featureSetConfiguration == 1:  # (weaksubj|strongsubj)-(both|neutral|positive|negative)
            for document in X:
                features_to_set = {
                    'weaksubj-both': False,
                    'weaksubj-neutral': False,
                    'weaksubj-positive': False,
                    'weaksubj-negative': False,
                    'strongsubj-both': False,
                    'strongsubj-neutral': False,
                    'strongsubj-positive': False,
                    'strongsubj-negative': False
                }
                currentSentence = documentCollection.find_one(
                    {'document_id': document})
                raw_sentence = currentSentence['raw'].lower()
                features_in_sentence = sc.analyse_sentence(raw_sentence)
                if features_in_sentence != []:
                    for feat in features_in_sentence:
                        features_to_set[feat] = True
                test = [
                    features_to_set[key]
                    for key in sorted(features_to_set.keys())
                ]
                temp.append(test)
        #TODO : 2
        elif self.featureSetConfiguration == 2:  # weak|strong
            for document in X:
                features_to_set = {'weaksubj': False, 'strongsubj': False}
                currentSentence = documentCollection.find_one(
                    {'document_id': document})
                raw_sentence = currentSentence['raw'].lower()
                features_in_sentence = sc.analyse_sentence(raw_sentence)
                if features_in_sentence != []:
                    for feat in features_in_sentence:
                        if re.findall('weaksubj', feat) != []:
                            features_to_set['weaksubj'] = True
                        elif re.findall('strongsubj', feat) != []:
                            features_to_set['strongsubj'] = True
                test = [
                    features_to_set[key]
                    for key in sorted(features_to_set.keys())
                ]
                temp.append(test)
        #TODO : 3 (polar | neutral)

        features = np.array(temp)
        #print('SubjectivityLexiconTransformer:' , self.featureSetConfiguration,' ### X:',len(X),'len(features):',len(features))
        mongoClient.close()
        return features
示例#53
0
def main():
    reload(sys)
    sys.setdefaultencoding('utf-8')

    companies = ['apple', 'google', 'samsung']
    db_names = ['twitter_apple_db', 'twitter_google_db', 'twitter_samsung_db']
    collection_names = [
        'twitter_apple_collection', 'twitter_google_collection',
        'twitter_samsung_collection'
    ]

    #output dir
    outPath = "/data/analysis_output"

    for i in range(len(companies)):
        print "Retrieving " + companies[i] + " collection from db..."

        outCSV = outPath + "/" + companies[i] + "_result.csv"

        #connect to mongo
        client = MongoClient()
        db = client[db_names[i]]
        collection = db[collection_names[i]]
        df = pd.DataFrame(list(collection.find()))

        print 'The dimension of data frame is ' + str(
            df.shape[0]) + ' x ' + str(df.shape[1])

        #print df.head(5)
        #sys.exit()

        ###################################
        #if reading from raw json files
        #opts = parse_args()
        #if opts.debug:
        #	logging.basicConfig(level=logging.DEBUG)
        #else:
        #	logging.basicConfig(level=logging.INFO)

        #get all the json files in the directory and aggregate in one df
        #fmask = os.path.join(opts.directory[0], '*.json')
        #df = get_merged_json(glob.glob(fmask),ignore_index=True)
        ###################################

        df_norm = dfCleanUp(df)
        df_norm = applyFilters(df_norm)

        #apply sentiment scoring
        print "Applying sentiment analysis..."

        compound = []
        pos = []
        neg = []
        neu = []
        for sentence in df_norm.text:
            ss = sentimentScoring(sentence)
            compound.append(ss['compound'])
            pos.append(ss['pos'])
            neg.append(ss['neg'])
            neu.append(ss['neu'])

        df_norm['ss_compound'] = compound
        df_norm['ss_pos'] = pos
        df_norm['ss_neg'] = neg
        df_norm['ss_neu'] = neu

        #add the brand company name to field
        df_norm['brand'] = companies[i]

        #print(sentence)
        #print(ss)

        #output to csv
        print "Writing to csv..."
        writeDFtoCSV(df_norm, outCSV)

        print "done."

        client.close()
示例#54
0
mongo_host = os.getenv('MONGO_HOST') or 'localhost'

logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

while True:
    mongo = MongoClient(mongo_host)
    db = mongo['okdiariocom-bot']
    delete_older_than(12, db)

    new_comment_count = 0
    post_count = 0
    for post in db['posts'].find():
        post_count = post_count + 1
        comments = get_comments(post['url'])

        for comment in comments:
            query = db['comments'].find_one({
                'comment_id': comment['comment_id'],
                'post_id': comment['post_id']
            })
            if query is None:
                if comment['posted_at'] > (time.time() - 12 * 3600):
                    logging.info('New comment: ' + comment['body'])
                    db['comments'].insert_one(comment)
                    new_comment_count = new_comment_count + 1
    mongo.close()
    logging.info(
        str(new_comment_count) + ' new comments found. ' + str(post_count) +
        ' posts processed.')
    time.sleep(13 * 60)
class MongoDB:
    def __init__(self, db_config=None, collection=None, db=None):
        global config

        self.client = None
        self.db = None
        self.collection = None
        self.connected = False

        if db_config is None:
            if os.environ.get('APP_RUNTIME_CONTEXT') == 'dev':
                db_config = config['mongo.dev']
                self.environ = 'dev'
            elif os.environ.get('APP_RUNTIME_CONTEXT') == 'qa':
                db_config = config['mongo.qa']
                self.environ = 'qa'
            else:
                db_config = config['mongo.prod']
                self.environ = 'prod'
            log.info('Using mongo.{} configuration.'.format(self.environ))
        else:
            log.info('Using db_config provided: {}'.format(db_config))

        if db_config:
            self.client = MongoClient(
                'mongodb+srv://{}:{}@{}/{}?retryWrites=true&w=majority'.format(
                    db_config['username'], db_config['password'],
                    db_config['host'], db_config['database']))

            # -- setup database
            if db:  # use the db argument when one is given
                self.db = self.client[db]
            else:
                if db_config.get('database'):
                    self.db = self.client[db_config['database']]
                else:
                    self.db = self.client['admin']
            # -- setup collection
            if collection:
                self.collection = self.db[collection]
            else:
                if db_config.get('collection'):
                    self.collection = self.db[db_config['collection']]
                else:
                    self.collection = self.db['system.version']
            self.connected = True
        if self.connected:
            log.info('CONNECTED to {}@{}'.format(self.db.name,
                                                 db_config['host']))
        else:
            log.info('NOT CONNECTED. (db={}, host={})'.format(
                db_config['database'], db_config['host']))

    def close(self):
        if self.status():
            self.client.close()
            self.connected = False
            log.info('DISCONNECTED.')

    def status(self):
        r = False
        if self.client is not None and self.client.server_info():
            if isinstance(self.db.name, str):
                r = self.connected = True
        return r
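
# A minimal usage sketch, assuming a db_config dict with the keys read by
# __init__ (username, password, host, database, collection) and the module's
# own `log`/`config` globals; the values below are hypothetical.
if __name__ == '__main__':
    db_config = {
        'username': 'app_user',
        'password': 'secret',
        'host': 'cluster0.example.mongodb.net',
        'database': 'appdata',
        'collection': 'events',
    }
    mongo = MongoDB(db_config=db_config)
    if mongo.status():
        mongo.collection.insert_one({'event': 'ping'})
    mongo.close()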
class MongoAnalysis(object):
    def __init__(self, tbname=None, saved_file_type=None):

        if tbname is None:
            raise ValueError("Not get a tbname!")
        self.tbname = tbname
        self.title = self.get_title()
        self.conn = MongoClient("localhost:27017", connect=True)
        self.db = self.conn['DBMovie']
        self.collection = self.db[self.tbname]
        self.style = Style(title_color='#fff',
                           title_pos="center",
                           width=1200,
                           height=600,
                           background_color='#404a59')
        self.saved_file_type = saved_file_type
        self.path = './img/{}/finished'.format(self.tbname)
        if not os.path.exists(self.path):
            os.makedirs(self.path)

    def get_title(self):
        '''
        Get the title of the movie from the id number embedded in tbname.
        :return: the movie title, or tbname if the lookup fails
        '''
        search_number = re.compile(r"\d+").findall(self.tbname)[0]
        title = GetMvInfo().get_title(search_number)
        if not title:
            title = self.tbname
        return title

    def GetOneCol(self, name, method=None):
        '''
        Fetch one column (field) from the mongodb collection.
        :param name: field name, such as "comment_content".
        :param method:
        if method is None, null values are dropped;
        if method is "average", null values are filled with the mean value.
        :return: a non-empty list of values
        '''
        if method is None:
            return [
                comments[name].strip() for comments in self.collection.find()
                if comments[name] is not None
            ]
        elif method == "average":
            com_lst = [
                comments[name].strip() for comments in self.collection.find()
                if comments[name] is not None
            ]
            aver = reduce(lambda x, y: x + y, map(int, com_lst)) / len(com_lst)
            result = []
            for comments in self.collection.find():
                if comments[name]:
                    result.append(int(comments[name].strip()))
                else:
                    result.append(aver)
            return result

    def AreaMap(self):
        '''
        :return: an effect-scatter map of user cities on a map of China
        required map packages:
        pip install echarts-countries-pypkg
        pip install echarts-china-provinces-pypkg
        pip install echarts-china-cities-pypkg
        pip install echarts-china-counties-pypkg
        pip install echarts-china-misc-pypkg
        pip install echarts-china-kingdom-pypkg
        '''
        # filter other countries' users
        city = dict(Counter(self.GetOneCol(name="city")))
        filter_city = {
            key: city[key]
            for key in city.keys()
            if re.compile(r'[\u4e00-\u9fa5]+').search(key[0])
        }
        key_map = [
            "河北", "山西", "辽宁", "吉林", "黑龙江", "江苏", "浙江", "安徽", "福建", "江西", "山东",
            "河南", "湖北", "湖南", "广东", "海南", "四川", "贵州", "云南", "陕西", "甘肃", "青海",
            "台湾"
        ]
        k_lst, v_lst = [], []
        for key in sorted(list(filter_city.keys())):
            v_lst.append(filter_city[key])
            if "," in key:
                key = key.split(",")[0].strip()
            for province in key_map:
                if province in key:
                    key = key.replace(province, "").strip()
            k_lst.append(key)
        v_max = max(v_lst)
        geo = Geo(self.title, "数据来源:豆瓣电影", **self.style.init_style)
        geo.add(
            "",
            k_lst,
            v_lst,
            type='effectScatter',  #other styles:scatter or heatmap
            visual_range=[0, v_max],
            visual_range_text="#fff",
            symbol_size=15,
            is_visualmap=True)
        if self.saved_file_type is None:
            geo.render(os.path.join(self.path, "AreaMap.png"))
        elif self.saved_file_type == "html":
            geo.render(os.path.join(self.path, "AreaMap.html"))

    def GetStars(self, star_score):
        '''
        Map a numeric comment score to a star grade.
        :param star_score: int
        :return: the grade label for star_score
        '''
        breakpoints = [11, 21, 31, 41, 51]
        grades = ["一星", "二星", "三星", "四星", "五星"]
        return grades[bisect.bisect(breakpoints, star_score)]

    def StarMap(self):
        '''
        Render a rose/pie chart of the star-grade distribution.
        :return:
        '''
        score = dict(
            Counter(
                map(self.GetStars,
                    self.GetOneCol(name="comment_score", method="average"))))
        attr, value = Geo.cast(score)
        pie = Pie(self.title, "数据来源:豆瓣电影", title_pos="center", width=900)
        pie.add("",
                attr,
                value,
                center=[50, 50],
                is_random=True,
                radius=[30, 75],
                rosetype="area",
                is_legend_show=False,
                is_label_show=True)
        if self.saved_file_type is None:
            pie.render(os.path.join(self.path, "StarMap.png"))
        elif self.saved_file_type == "html":
            pie.render(os.path.join(self.path, "StarMap.html"))

    def Cast(self, name, method=None, message=None, max_bin=100):
        '''
        Casts the data and filters it with stopwords.
        :param name: colname
        :param method: decides whether the func returns a dict or a tuple (attr, value)
        :param message: a message the user gives; if not None, it is added to the stopwords
        :param max_bin: the max number of words on the wordcloud
        :return:
        '''
        string = "".join(self.GetOneCol(name))
        brokewords = map(
            str.strip,
            open('./config/stopwords/stopwords.txt', "r",
                 encoding="utf-8").readlines())
        if message:
            brokewords = itertools.chain(brokewords, message.split(",")[:])
        stopwords = "".join(brokewords)
        lis = dict(
            Counter([
                tag.strip() for tag in analyse.extract_tags(string, max_bin)
                if tag.strip() not in stopwords
            ]))
        lis = sorted(lis.items(), key=lambda x: x[1], reverse=True)
        if method is None:
            return Geo.cast(lis)
        elif method == "dict":
            return {k[0]: k[1] for k in lis}

    def WordCloudMap(self, message=None):
        '''
        Render a word cloud with the wordcloud package, shaped by a background image.
        :param message: extra stopwords given by the user
        :return:
        '''
        from wordcloud import WordCloud
        backgroud_path = './img/{}/background/{}.png'.format(
            self.tbname, self.tbname)
        if not os.path.exists(backgroud_path):
            backgroud_path = './img/sample/1.jpg'
        backgroud_image = plt.imread(backgroud_path)
        cloud = WordCloud(
            width=1024,
            height=768,
            font_path='./config/fonts/simhei.ttf',
            background_color='white',  # background color
            mask=backgroud_image,  # mask image defining the cloud shape
            max_words=100,  # maximum number of words
            max_font_size=400,  # largest font size
            random_state=50  # random seed controlling the layout
        )
        if message is None:
            text = self.Cast(name="comment_content", method="dict")
        else:
            message.replace(",", ",")
            if "," not in message:
                message = message + ","
            text = self.Cast(name="comment_content",
                             method="dict",
                             message=message)
        cloud.fit_words(text)  # generate the word cloud from word frequencies
        cloud.recolor(color_func=ImageColorGenerator(backgroud_image))
        plt.figure()
        plt.imshow(cloud)
        plt.axis('off')
        cloud.to_file(os.path.join(self.path, "wordcloud.png"))

    def SimpleWordCloudMap(self):
        '''
        Render a simpler word cloud with pyecharts.
        :return:
        '''
        from pyecharts.charts.wordcloud import WordCloud
        attr, value = self.Cast(name="comment_content")
        wordcloud = WordCloud(self.title,
                              "数据来源:豆瓣电影",
                              title_pos="center",
                              width=1200,
                              height=600)
        wordcloud.add("",
                      attr,
                      value,
                      shape="diamond",
                      word_size_range=[20, 100])
        if self.saved_file_type is None:
            wordcloud.render(os.path.join(self.path, "wordcloud.png"))
        elif self.saved_file_type == "html":
            wordcloud.render(os.path.join(self.path, "wordcloud.html"))

    def close(self):
        self.conn.close()
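
# A minimal usage sketch; "Movie26266893" is a hypothetical collection name
# holding scraped Douban comments for one movie, and GetMvInfo, Style, Geo,
# Pie etc. are assumed to be importable from the surrounding project.
if __name__ == '__main__':
    analysis = MongoAnalysis(tbname="Movie26266893", saved_file_type="html")
    analysis.AreaMap()       # city distribution on a map of China
    analysis.StarMap()       # star-grade rose/pie chart
    analysis.WordCloudMap()  # word cloud shaped by the background image
    analysis.close()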
示例#57
0
class TransPS():
    def __init__(self):
        self.cfg = configparser.ConfigParser()
        self.cfg.read("config.ini")
        cmdb_db = self.cfg.get("cmdb", "db")
        cmdb_str = self.cfg.get("cmdb", "conn_str")
        self.client = MongoClient(cmdb_str)
        self.db = self.client[cmdb_db]

    def format_server_name(self, df, col_name):
        df[col_name] = df[col_name].str.lower().map(
            lambda x: x.split('.cargosmart.com')[0])

    def trans_size_to_mb(self, size_str):
        result = re.search(r'(?P<size>\d+)\s*(?P<unit>\w*)',
                           size_str).groupdict()
        g_unit = ['G', 'GB']
        if result.get('unit') in g_unit:
            size_m = str(int(result.get('size')) * 1024)
        else:
            size_m = result.get('size')
        return size_m

    def get_osvendor(self, osversion):
        if 'windows' in osversion.lower().replace(' ', ''):
            osvendor = 'windows'
        elif 'redhat' in osversion.lower().replace(' ', ''):
            osvendor = 'redhat'
        elif 'centos' in osversion.lower().replace(' ', ''):
            osvendor = 'centos'
        elif 'esx' in osversion.lower().replace(' ', ''):
            osvendor = 'esx'
        else:
            osvendor = ''
        return osvendor

    def format_env(self, in_env_name):
        if 'PRE' in in_env_name.upper():
            env_name = 'PP'
        elif 'MAINT' in in_env_name.upper():
            env_name = 'PM'
        else:
            env_name = in_env_name.upper()
        return env_name

    def write_to_cmdb(self, coll_name, df):
        coll = self.db[coll_name]
        result = coll.delete_many({})
        logger.info("%s deleted %s rows" %
                    (coll_name, str(result.deleted_count)))
        result = coll.insert_many(json.loads(df.to_json(orient='records')))
        logger.info("%s inserted %s rows" %
                    (coll_name, str(len(result.inserted_ids))))

    def main(self):
        # server
        excel_server_coll = self.db['excel_server']
        vcenter_server_coll = self.db['vcenter_server']
        oem_server_coll = self.db['oem_server']
        vcenter_vm_coll = self.db['vcenter_virtualmachine']
        excel_server_df = pd.DataFrame(list(excel_server_coll.find()))
        vcenter_server_df = pd.DataFrame(list(vcenter_server_coll.find()))
        oem_server_df = pd.DataFrame(list(oem_server_coll.find()))
        vcenter_vm_df = pd.DataFrame(list(vcenter_vm_coll.find()))

        self.format_server_name(excel_server_df, 'excel_name')
        self.format_server_name(vcenter_server_df, 'vc_name')
        self.format_server_name(vcenter_vm_df, 'vc_name')
        self.format_server_name(oem_server_df, 'oem_name')

        # get oem physical server names by (oem servers - vcenter vms)
        tempdf = pd.merge(oem_server_df,
                          vcenter_vm_df,
                          left_on='oem_name',
                          right_on='vc_name',
                          how='left')
        oem_ps_names = tempdf.loc[tempdf['vc_name'].isnull(), 'oem_name']

        # get all physical servers by unioning the excel, vcenter and oem
        # physical server names and de-duplicating
        excel_ps_names = excel_server_df['excel_name']
        vcenter_ps_names = vcenter_server_df['vc_name']
        ps_names = concat(
            [concat([excel_ps_names, vcenter_ps_names]),
             oem_ps_names]).unique()

        ps_names_df = pd.DataFrame(ps_names, columns=['ps_name'])
        join1 = pd.merge(ps_names_df,
                         excel_server_df,
                         left_on='ps_name',
                         right_on='excel_name',
                         how='left')
        join2 = pd.merge(join1,
                         vcenter_server_df,
                         left_on='ps_name',
                         right_on='vc_name',
                         how='left')
        ps_df = pd.merge(join2,
                         oem_server_df,
                         left_on='ps_name',
                         right_on='oem_name',
                         how='left').fillna(value='')
        ps_df['merge_name'] = ps_df['ps_name']

        # pdb.set_trace()

        # drop the mongo _id columns and the join-key name columns

        delete_cols = [
            mongoid_col for mongoid_col in ps_df.columns
            if '_id' in mongoid_col
        ]
        delete_cols += ['excel_name', 'vc_name', 'oem_name']
        ps_df = ps_df.drop(delete_cols, axis=1)

        # set cpu num, priority: vc > oem > excel

        ps_df['merge_cpu_num'] = ps_df['vc_cpu_num']
        ps_df.loc[ps_df.merge_cpu_num == '',
                  'merge_cpu_num'] = ps_df.loc[ps_df.merge_cpu_num == '',
                                               'oem_cpu_num']
        ps_df.loc[ps_df.merge_cpu_num == '',
                  'merge_cpu_num'] = ps_df.loc[ps_df.merge_cpu_num == '',
                                               'excel_cpu_num']

        # set cpu type, priority: vc > excel

        ps_df['vc_cpu_type'] = ps_df['vc_cpu_type'].map(
            lambda x: re.sub(r'\s\s+', ' ',
                             str(x).upper().split(' @ ')[0].strip()))

        ps_df['excel_cpu_type'] = ps_df['excel_cpu_type'].map(
            lambda x: str(x).upper().strip().replace(
                'ULTRASPARC IIII', '  ULTRASPARC-IIII').replace(
                    'INTEL(R) XEON®', 'INTEL(R) XEON(R)').replace(
                        'INTEL XEON', 'INTEL(R) XEON(R)').replace(
                            'INTEL® PENTIUM®', 'INTEL(R) PENTIUM(R)'))

        ps_df['merge_cpu_type'] = ps_df['vc_cpu_type']
        ps_df.loc[ps_df.merge_cpu_type == '',
                  'merge_cpu_type'] = ps_df.loc[ps_df.merge_cpu_type == '',
                                                'excel_cpu_type']

        # set cpu core, priority: vc > oem > excel

        ps_df['merge_cpu_core'] = ps_df['vc_cpu_core']
        ps_df.loc[ps_df['merge_cpu_core'] == '',
                  'merge_cpu_core'] = ps_df.loc[ps_df['merge_cpu_core'] == '',
                                                'oem_cpu_num']
        ps_df.loc[ps_df['merge_cpu_core'] == '',
                  'merge_cpu_core'] = ps_df.loc[ps_df['merge_cpu_core'] == '',
                                                'excel_cpu_core']

        # set cpu speed from vcenter data

        ps_df['merge_cpu_speed'] = ps_df["vc_cpu_speedGHz"]

        ps_df['merge_cpu_cache_size'] = ps_df['excel_cpu_cache_size']

        ps_df['merge_cpu_thread'] = ps_df['vc_cpu_thread']

        # set memory size, priority: vc > oem > excel

        ps_df.loc[
            ps_df['excel_memory_size'] != '', 'excel_memory_size'] = ps_df.loc[
                ps_df['excel_memory_size'] != '',
                'excel_memory_size'].map(lambda x: self.trans_size_to_mb(x))

        ps_df['merge_mem_size'] = ps_df["vc_memory_size"]

        ps_df.loc[ps_df.merge_mem_size == '',
                  'merge_mem_size'] = ps_df.loc[ps_df.merge_mem_size == '',
                                                'oem_memory_size']

        ps_df.loc[ps_df['merge_mem_size'] == '',
                  'merge_mem_size'] = ps_df.loc[ps_df['merge_mem_size'] == '',
                                                'excel_memory_size']

        # set system disk, excel
        ps_df['merge_system_disk'] = ps_df['excel_system_disk']
        ps_df['merge_external_disk'] = ps_df[
            'excel_external_disk'].str.replace('Nil', '')

        # set brand name from vcenter data
        ps_df['merge_brand_name'] = ps_df['vc_brand_name']

        # set model name from vcenter data
        ps_df['merge_model_name'] = ps_df["vc_model_name"]

        # pdb.set_trace()

        # set os version, priority: vc > oem
        ps_df['merge_osversion_name'] = ps_df["vc_os_version"]
        ps_df.loc[ps_df['merge_osversion_name'] == '',
                  'merge_osversion_name'] = ps_df.loc[
                      ps_df['merge_osversion_name'] == '',
                      'oem_osversion_name']

        # set os vendor
        ps_df['merge_osvendor'] = ps_df['merge_osversion_name'].map(
            lambda x: self.get_osvendor(x))

        # set env purpose from excel; environment from vcenter

        ps_df['merge_env_purpose'] = ps_df['excel_env_purpose'].map(
            lambda x: self.format_env(x))

        ps_df['merge_environment'] = ps_df['vc_env']

        # set others by excel

        # ps_df['merge_fiber_card2_model'] = ps_df['excel_fiber_card2_model']
        # ps_df['merge_fiber_card3_model'] = ps_df[
        #     'excel_fiber_card3_model'].str.replace(' ', '')
        # ps_df['merge_fiber_card_model'] = ps_df[
        # 'excel_fiber_card_model'].str.upper().replace('NIL', '')

        ps_df['merge_fiber_card_model'] = ps_df['vc_fiber_hba_device']
        # ps_df['merge_fiber_card_num'] = ps_df[
        #     'excel_fiber_card_num'].str.replace(' ', '').fillna('')
        ps_df['merge_fiber_card_num'] = ps_df['vc_fiber_hba_num']

        # normalize fiber port counts to integer strings, then copy to merge_*
        ps_df.loc[ps_df['excel_fiber_port'] != '',
                  'excel_fiber_port'] = ps_df.loc[
                      ps_df['excel_fiber_port'] != '',
                      'excel_fiber_port'].map(lambda x: str(int(x)))
        ps_df['merge_fiber_port'] = ps_df['excel_fiber_port']

        ps_df['merge_hw_model_eol_date'] = ps_df['excel_hw_model_eol_date']

        ps_df['merge_ip'] = ps_df['vc_ip']

        # normalize lan port counts to integer strings, then copy to merge_*
        ps_df.loc[ps_df['excel_lan_port'] != '', 'excel_lan_port'] = ps_df.loc[
            ps_df['excel_lan_port'] != '',
            'excel_lan_port'].map(lambda x: str(int(x)))
        ps_df['merge_lan_port'] = ps_df['excel_lan_port']

        ps_df['merge_location'] = ps_df['excel_location']

        ps_df['merge_maint_from'] = ps_df['excel_maint_from'].map(
            lambda x: x.strip())

        ps_df['merge_maint_status'] = ps_df['excel_maint_status']

        ps_df['merge_maint_to'] = ps_df['excel_maint_to']

        ps_df['merge_maint_vendor'] = ps_df['excel_maint_vendor']

        ps_df['merge_os_service_pack'] = ps_df['excel_os_service_pack'].map(
            lambda x: str(x))

        ps_df['merge_power_port'] = ps_df['excel_power_port']
        ps_df.loc[ps_df['merge_power_port'] != '',
                  'merge_power_port'] = ps_df.loc[
                      ps_df['merge_power_port'] != '',
                      'merge_power_port'].map(lambda x: str(int(x)))

        ps_df['merge_power_status'] = ps_df['vc_power_status'].str.upper()

        ps_df['merge_rack_location'] = ps_df['excel_rack_location'].map(
            lambda x: x.upper().replace(' ', ''))

        ps_df['merge_serial_num'] = ps_df['excel_serial_num']
        ps_df['merge_server_function'] = ps_df['excel_server_function']

        ps_df['merge_server_type'] = ps_df['excel_server_type']

        ps_df['merge_check_by'] = ps_df['excel_check_by']

        ps_df['merge_check_date'] = ps_df['last on-site check date']

        merge_cols = [
            col.lower() for col in ps_df.columns if 'merge' in col.lower()
        ]

        ps_df = ps_df[merge_cols]

        # write to mongodb
        self.write_to_cmdb(coll_name='merge_phisical_server', df=ps_df)
        self.client.close()
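
# A minimal usage sketch; the [cmdb] section of config.ini (hypothetical
# values below) is assumed to provide the connection string and database
# name read in __init__:
#
#   [cmdb]
#   db = cmdb
#   conn_str = mongodb://localhost:27017/
#
if __name__ == '__main__':
    TransPS().main()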
示例#58
0
文件: mongodb.py 项目: tsarpaul/CAPE
class MongoDB(Report):
    """Stores report in MongoDB."""
    order = 9999

    # Mongo schema version, used for data migration.
    SCHEMA_VERSION = "1"

    def connect(self):
        """Connects to Mongo database, loads options and set connectors.
        @raise CuckooReportError: if unable to connect.
        """
        host = self.options.get("host", "127.0.0.1")
        port = self.options.get("port", 27017)
        db = self.options.get("db", "cuckoo")

        try:
            self.conn = MongoClient(
                host,
                port=port,
                username=self.options.get("username", None),
                password=self.options.get("password", None),
                authSource=db)
            self.db = self.conn[db]
        except TypeError:
            raise CuckooReportError("Mongo connection port must be integer")
        except ConnectionFailure:
            raise CuckooReportError("Cannot connect to MongoDB")

    def debug_dict_size(self, dct, parent_key=False):
        if type(dct) == list:
            dct = dct[0]

        if isinstance(dct, str) and parent_key:
            dct = {parent_key: dct}

        if not isinstance(dct, str):
            totals = dict((k, 0) for k in dct)

        def walk(root, key, val):
            if isinstance(val, dict):
                for k, v in val.iteritems():
                    walk(root, k, v)

            elif isinstance(val, (list, tuple, set)):
                for el in val:
                    walk(root, None, el)

            elif isinstance(val, basestring):
                totals[root] += len(val)

        for key, val in dct.iteritems():
            walk(key, key, val)

        return sorted(totals.items(), key=lambda item: item[1], reverse=True)

    @classmethod
    def ensure_valid_utf8(cls, obj):
        """Ensures that all strings are valid UTF-8 encoded, which is
        required by MongoDB to be able to store the JSON documents.
        @param obj: analysis results dictionary.
        """
        if not obj:
            return

        items = []
        if isinstance(obj, dict):
            items = obj.iteritems()
        elif isinstance(obj, list):
            items = enumerate(obj)

        for k, v in items:
            # This type check is intentionally not done using isinstance(),
            # because bson.binary.Binary *is* a subclass of bytes/str, and
            # we do not want to convert that.
            if type(v) is str:
                try:
                    v.decode('utf-8')
                except UnicodeDecodeError:
                    obj[k] = u''.join(unichr(ord(_))
                                      for _ in v).encode('utf-8')
            else:
                cls.ensure_valid_utf8(v)

    def run(self, results):
        """Writes report.
        @param results: analysis results dictionary.
        @raise CuckooReportError: if fails to connect or write to MongoDB.
        """
        # We put the raise here and not at the import because it would
        # otherwise trigger even if the module is not enabled in the config.
        if not HAVE_MONGO:
            raise CuckooDependencyError("Unable to import pymongo "
                                        "(install with `pip install pymongo`)")

        self.connect()

        # Set mongo schema version.
        # TODO: This is not optimal because it runs on each analysis. It
        # should run only once at startup.
        if "cuckoo_schema" in self.db.collection_names():
            if self.db.cuckoo_schema.find_one(
            )["version"] != self.SCHEMA_VERSION:
                CuckooReportError(
                    "Mongo schema version not expected, check data migration tool"
                )
        else:
            self.db.cuckoo_schema.save({"version": self.SCHEMA_VERSION})

        # Create a copy of the dictionary. This is done in order to not modify
        # the original dictionary and possibly compromise the following
        # reporting modules.
        report = dict(results)

        if "network" not in report:
            report["network"] = {}

        # Add screenshot paths
        report["shots"] = []
        shots_path = os.path.join(self.analysis_path, "shots")
        if os.path.exists(shots_path):
            shots = [
                shot for shot in os.listdir(shots_path)
                if shot.endswith(".jpg")
            ]
            for shot_file in sorted(shots):
                shot_path = os.path.join(self.analysis_path, "shots",
                                         shot_file)
                screenshot = File(shot_path)
                if screenshot.valid():
                    # Strip the extension as it's added later
                    # in the Django view
                    report["shots"].append(shot_file.replace(".jpg", ""))

        # Store chunks of API calls in a different collection and reference
        # those chunks back in the report. In this way we should defeat the
        # issue with the oversized reports exceeding MongoDB's boundaries.
        # Also allows paging of the reports.
        new_processes = []

        for process in report.get("behavior", {}).get("processes", []) or []:
            new_process = dict(process)
            chunk = []
            chunks_ids = []
            # Loop on each process call.
            for index, call in enumerate(process["calls"]):
                # Flush the chunk to MongoDB once it reaches 100 calls;
                # any leftover calls are stored after the loop.
                if len(chunk) == 100:
                    to_insert = {"pid": process["process_id"], "calls": chunk}
                    chunk_id = self.db.calls.insert(to_insert)
                    chunks_ids.append(chunk_id)
                    # Reset the chunk.
                    chunk = []
                # Append call to the chunk.
                chunk.append(call)
            # Store leftovers.
            if chunk:
                to_insert = {"pid": process["process_id"], "calls": chunk}
                chunk_id = self.db.calls.insert(to_insert)
                chunks_ids.append(chunk_id)
            # Add list of chunks.
            new_process["calls"] = chunks_ids
            new_processes.append(new_process)
        # Store the results in the report.
        report["behavior"] = dict(report.get("behavior", {}))
        report["behavior"]["processes"] = new_processes

        # Calculate the mlist_cnt for display if present to reduce db load
        if "signatures" in results:
            for entry in results["signatures"]:
                if entry["name"] == "ie_martian_children":
                    report["mlist_cnt"] = len(entry["data"])
                if entry["name"] == "office_martian_children":
                    report["f_mlist_cnt"] = len(entry["data"])

        # Other info we want quick access to from the web UI
        if results.has_key("virustotal") and results["virustotal"] and results[
                "virustotal"].has_key(
                    "positives") and results["virustotal"].has_key("total"):
            report["virustotal_summary"] = "%s/%s" % (
                results["virustotal"]["positives"],
                results["virustotal"]["total"])
        if results.get("suricata", False):

            keywords = ("tls", "alerts", "files", "http", "ssh", "dns")
            keywords_dict = ("suri_tls_cnt", "suri_alert_cnt", "suri_file_cnt",
                             "suri_http_cnt", "suri_ssh_cnt", "suri_dns_cnt")
            for keyword, keyword_value in zip(keywords, keywords_dict):
                if results["suricata"].get(keyword, 0):
                    report[keyword_value] = len(results["suricata"][keyword])

        # Create an index based on the info.id dict key. Increases overall scalability
        # with large amounts of data.
        # Note: Silently ignores the creation if the index already exists.
        self.db.analysis.create_index("info.id", background=True)

        #trick for distributed api
        if results.get("info", {}).get("options", {}).get("main_task_id", ""):
            report["info"]["id"] = int(
                results["info"]["options"]["main_task_id"])

        analyses = self.db.analysis.find(
            {"info.id": int(report["info"]["id"])})
        if analyses.count() > 0:
            log.debug("Deleting analysis data for Task %s" %
                      report["info"]["id"])
            for analysis in analyses:
                log.info(analysis)
                for process in analysis["behavior"]["processes"]:
                    for call in process["calls"]:
                        self.db.calls.remove({"_id": ObjectId(call)})
                self.db.analysis.remove({"_id": ObjectId(analysis["_id"])})
            log.debug("Deleted previous MongoDB data for Task %s" %
                      report["info"]["id"])

        self.ensure_valid_utf8(report)

        # Store the report and retrieve its object id.
        try:
            self.db.analysis.save(report, check_keys=False)
        except InvalidDocument as e:
            parent_key, psize = self.debug_dict_size(report)[0]
            if not self.options.get("fix_large_docs", False):
                # Just log the error and problem keys
                log.error(str(e))
                log.error("Largest parent key: %s (%d MB)" %
                          (parent_key, int(psize) / MEGABYTE))
            else:
                # Delete the problem keys and check for more
                error_saved = True
                size_filter = MONGOSIZELIMIT
                while error_saved:
                    if type(report) == list:
                        report = report[0]
                    try:
                        if type(report[parent_key]) == list:
                            for j, parent_dict in enumerate(
                                    report[parent_key]):
                                child_key, csize = self.debug_dict_size(
                                    parent_dict, parent_key)[0]
                                if csize > size_filter:
                                    if parent_key == child_key:
                                        log.warn(
                                            "results['%s'] deleted due to size: %s"
                                            % (parent_key, csize))
                                        del report[parent_key]
                                        break
                                    else:
                                        log.warn(
                                            "results['%s']['%s'] deleted due to size: %s"
                                            % (parent_key, child_key, csize))
                                        del report[parent_key][j][child_key]
                        else:
                            child_key, csize = self.debug_dict_size(
                                report[parent_key], parent_key)[0]
                            if csize > size_filter:
                                log.warn(
                                    "else - results['%s']['%s'] deleted due to size: %s"
                                    % (parent_key, child_key, csize))
                                del report[parent_key][child_key]
                        try:
                            self.db.analysis.save(report, check_keys=False)
                            error_saved = False
                        except InvalidDocument as e:
                            parent_key, psize = self.debug_dict_size(report)[0]
                            log.error(str(e))
                            log.error("Largest parent key: %s (%d MB)" %
                                      (parent_key, int(psize) / MEGABYTE))
                            size_filter = size_filter - MEGABYTE
                    except Exception as e:
                        log.error("Failed to delete child key: %s" % str(e))
                        error_saved = False

        self.conn.close()
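
# A hedged sketch (not part of the reporting module) of how the chunked call
# documents stored above could be reassembled when a report is read back:
def load_process_calls(db, process):
    """Rebuild the flat call list for one process from its stored chunk ids."""
    calls = []
    for chunk_id in process["calls"]:
        chunk = db.calls.find_one({"_id": chunk_id})
        if chunk:
            calls.extend(chunk["calls"])
    return calls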
示例#59
0
    def _clear_database(self):
        client = MongoClient('localhost', 27017)
        client.drop_database('user')
        client.close()
class RedditParser(object):
    def __init__(self, config_data_name, logfile):
        self.set_config_data(config_data_name)
        self.subreddits = self.input["subreddits"]
        self.client_id = self.input["client_id"]
        self.client_secret = self.input["client_secret"]
        self.password = self.input["password"]
        self.username = self.input["username"]
        self.user_agent = self.input["user_agent"]
        self.starting_point_date = self.input["starting_point_date"]
        self.reddit_API = praw.Reddit(client_id=self.client_id,
                                      client_secret=self.client_secret,
                                      password=self.password,
                                      username=self.username,
                                      user_agent=self.user_agent)
        self.logfile = logfile

    def set_config_data(self, config_data_name):
        with open(config_data_name) as config_file:
            self.input = json.load(config_file)

    def create_connection_db(self):
        #self.mongo_connection = MongoClient(os.environ['DB_PORT_27017_TCP_ADDR'], 27017) #Use for Docker
        self.mongo_connection = MongoClient()
        self.db = self.mongo_connection.reddit_data

    def close_connection_db(self):
        self.mongo_connection.close()

    def create_mongo_collection_and_index(self, collection_name):
        self.create_connection_db()
        if collection_name not in self.db.collection_names():
            self.db[collection_name].create_index(
                [("_id", ASCENDING), ("created_date", DESCENDING),
                 ("subreddit", ASCENDING)],
                name="reddit_items_index",
                unique=True,
                dropDups=1)
        self.close_connection_db()

    def get_submissions_and_comments(self, collection, testing_purpose):
        try:
            start_point_date = datetime.datetime.strptime(
                self.starting_point_date, "%Y-%m-%d %H:%M:%S")
            start_unix_time = time.mktime(start_point_date.timetuple())

            if testing_purpose == True:
                end_point_date = start_point_date + timedelta(hours=1)
                end_unix_time = time.mktime(end_point_date.timetuple())
            else:
                end_unix_time = None

            reddit = self.reddit_API
            submissions = []
            comments = []
            for subreddit_string in self.subreddits:
                logging.info(
                    "Starting to fetch reddit data for subreddit %s from %s ... "
                    % (subreddit_string, start_unix_time))
                subreddit = reddit.subreddit(subreddit_string)
                subreddit_submissions = [
                    submission for submission in subreddit.submissions(
                        start=start_unix_time, end=end_unix_time)
                ]
                subreddit_comments = [
                    comment for submission in subreddit_submissions
                    for comment in submission.comments
                ]

                self.create_connection_db()
                for submission in subreddit_submissions:
                    self.db[collection].save(
                        RedditItem(submission.id, submission.title, None,
                                   subreddit_string,
                                   submission.created_utc).item)

                for comment in subreddit_comments:
                    self.db[collection].save(
                        RedditItem(comment.id, None, comment.body,
                                   subreddit_string, comment.created_utc).item)
                self.close_connection_db()
                logging.info(
                    "Successfully update reddit data for subreddit %s from %s ... "
                    % (subreddit_string, start_unix_time))

                submissions += subreddit_submissions
                comments += subreddit_comments

            logging.info("Succesfully updated reddit data from %s ... " %
                         start_unix_time)
            return submissions + comments

        except Exception as ex:
            logging.error(
                "Exception ocurred, please find out the exception message : %s"
                % ex.message)
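
# A minimal usage sketch; "reddit_config.json" and "parser.log" are
# hypothetical file names, and RedditItem is assumed to be defined elsewhere
# in the project.
if __name__ == '__main__':
    parser = RedditParser("reddit_config.json", "parser.log")
    parser.create_mongo_collection_and_index("reddit_items")
    items = parser.get_submissions_and_comments("reddit_items",
                                                testing_purpose=True)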