예제 #1
0
#!/usr/bin/python
from mongodb import mongodb
import sys
import filters
# Name of the database to export, taken from the first command-line argument.
# It is also used as the prefix of both output files.
db = sys.argv[1]
mdb = mongodb.mongodb('localhost', 27017, db)

# First export: the 'domaine' field of every 'metadatas' record, one per line.
with open(db + '_domaine.txt', 'w') as fw:
    fw.write('**** *domaine\n')
    for record in mdb.selectall('metadatas'):
        fw.write(record['domaine'])
        fw.write('\n')

# Second export: the 'meta' field of every 'metadatas' record, with every
# entry of filters.filters_metadata removed, one per line.
with open(db + '_metadatas.txt', 'w') as fw:
    fw.write('**** *metadata\n')
    for record in mdb.selectall('metadatas'):
        meta = record['meta']
        for filt in filters.filters_metadata:
            # Strip the filter word as listed and in its case-swapped form.
            meta = meta.replace(filt, '')
            meta = meta.replace(filt.swapcase(), '')
        # Drop non-ASCII characters, then decode back to text: the file is
        # opened in text mode, so writing the raw bytes object would raise
        # TypeError on Python 3.
        fw.write(meta.encode('ascii', 'ignore').decode('ascii'))
        fw.write('\n')
# No explicit fw.close(): each `with` block already closed its file.
예제 #2
0
 def __init__(self, db):
     """Create the MongoDB wrapper for `db` and initialise the white lists.

     `db` is passed to `mongodb.mongodb` together with 'localhost' and
     27017 (presumably the server host and port).
     """
     host, port = 'localhost', 27017
     self.mdb = mongodb.mongodb(host, port, db)
     # Starts empty.
     self.white_list = list()
     # Fixed list of domain names (presumably domains that are always allowed).
     self.white_domaine = [
         'msn.com',
         'google.com',
         'wikipedia.fr',
         'free.fr',
         'linkedin.com',
     ]
예제 #3
0
 def __init__(self, host, db):
     """Create the MongoDB wrapper for `db` on `host` and an empty network map.

     `host` and `db` are forwarded to `mongodb.mongodb` with the fixed
     value 27017 (presumably the server port).
     """
     port = 27017
     self.db = mongodb.mongodb(host, port, db)
     # Starts empty.
     self.networks = dict()
예제 #4
0
파일: app3.py 프로젝트: FeilyZhang/PkuLaw
        return f.read()


ret = readCSV('dict.txt')
# One dictionary term per line of the file.
s = ret.split('\n')
# Set view of the same terms so that each membership test below is O(1)
# instead of a linear scan of the list.
terms = set(s)

# Extract keywords and merge them by month.
cols = ['law_detail', 'justice_detail', 'rule_detail', 'dept_detail', 'industry_detail', 'party_detail']
# rst[m] collects the matched keywords and dic[m] their counts for month m + 1.
rst = [[] for _ in range(12)]
dic = [{} for _ in range(12)]
for col in cols:
    for ele in mongodb().find_all('pkulaw', col):
        pub_date = ele['pub_date']
        if pub_date == '':
            continue
        # pub_date is dot-separated; the second field is used as the month
        # (presumably 'YYYY.MM.DD' -- confirm against the stored data).
        parts = pub_date.split('.')
        month = int(parts[1])
        # Only months 1..12 map onto one of the twelve buckets.
        if not 1 <= month <= 12:
            continue
        # The third field must be below 50; checked only for a valid month.
        if int(parts[2]) >= 50:
            continue
        tags = jieba.analyse.extract_tags(
            getContent(str(ele['content'][0])), topK=30, withWeight=False, allowPOS=())
        # Keep only the extracted tags that appear in the dictionary file.
        rst[month - 1].extend(tag for tag in tags if tag in terms)

# Count keyword frequency per month.
for month_terms, counts in zip(rst, dic):
    for term in month_terms:
        counts[term] = counts.get(term, 0) + 1
예제 #5
0
import sys
import glob
import os
from mongodb import mongodb

# Paths matching the glob pattern given as the first command-line argument.
pathdirectory = glob.glob(sys.argv[1])
# Database name (second command-line argument); also prefixes the log file name.
db = sys.argv[2]
mdb = mongodb.mongodb('localhost', 27017, db)

with open(db + '_cleaned.log', 'w') as fw:
    for name_file in pathdirectory:
        # The file name without directory and extension is the value looked
        # up in the 'domaine' field. os.path.basename handles the platform's
        # path separator instead of assuming '/'.
        domaine = os.path.basename(os.path.splitext(name_file)[0])
        results = mdb.selectbycreteria('domaine', domaine, 'new_domaines')

        # One 'ip;domaine' line per matching record.
        for result in results:
            fw.write(result['ip'] + ';' + result['domaine'] + '\n')
# No explicit fw.close(): the `with` block already closed the file.
예제 #6
0
 def __init__(self, host, db):
     """Set up the MongoDB wrapper for `db` on `host` plus an empty network map.

     The wrapper is built by `mongodb.mongodb(host, 27017, db)`; 27017 is
     presumably the server port.
     """
     mongo_port = 27017
     self.db = mongodb.mongodb(host, mongo_port, db)
     # Starts empty.
     self.networks = dict()
예제 #7
0
'''
'''
# 获取并存储法律
for ele in mongodb().find_all('pkulaw1', 'law'):
    print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'law_details'))

# 获取并存储行政法规
for ele in mongodb().find_all('pkulaw1', 'rule'):
    print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'rule_details'))

# 获取并存储司法解释
for ele in mongodb().find_all('pkulaw1', 'justice'):
    print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'justice_details'))
'''

# 获取并存储部门规章
for ele in mongodb().find_all('pkulaw1', 'dept'):
   print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'dept_details'))

# 获取并存储党内法规
for ele in mongodb().find_all('pkulaw1', 'party'):
    print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'party_details'))
'''
# 获取并存储团体规定
for ele in mongodb().find_all('pkulaw', 'group'):
   print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'group_detail'))
'''
# 获取并存储行业规定
for ele in mongodb().find_all('pkulaw1', 'industry'):
   print(parsing().get_html(ele).get_json(mongodb(), 'pkulaw1', 'industry_details'))
예제 #8
0
 def __init__(self, cols):
     """Store `cols`, create a `mongodb` instance and three empty lists."""
     self.__mongodb = mongodb()
     self.__cols = cols
     # All three containers start empty.
     self.__sets = []
     self.__realSets = []
     self.__indexs = []
예제 #9
0
파일: pkulaw.py 프로젝트: FeilyZhang/PkuLaw
 def __init__(self):
     """Create the `browser` instance and the `mongodb` instance.

     The browser is built from the private `__url`, `__exe_path` and
     `__service_log_path` attributes, which must already be available on
     the object (they are not set here).
     """
     url = self.__url
     exe_path = self.__exe_path
     log_path = self.__service_log_path
     self.__browser_instance = browser(url, exe_path, log_path)
     self.__mongo = mongodb()
예제 #10
0
 def __init__(self, db):
     """Build the MongoDB wrapper for `db` and initialise the white lists.

     The wrapper is created with 'localhost' and 27017 (presumably the
     server host and port) followed by `db`.
     """
     connection_args = ('localhost', 27017, db)
     self.mdb = mongodb.mongodb(*connection_args)
     # Starts empty.
     self.white_list = list()
     # Fixed list of domain names (presumably domains that are always allowed).
     self.white_domaine = [
         'msn.com',
         'google.com',
         'wikipedia.fr',
         'free.fr',
         'linkedin.com',
     ]