def run():
    """Worker loop: poll mongo for pending data-report tasks, render each one
    to an Excel file, upload it to OSS, and mark the task done.

    Polls ``mongo.task.data_report`` for documents with ``processStatus == 0``
    (10 at a time), and for each:
      1. builds a DataFrame via ``data_code.run2`` for the task's date range,
      2. writes it to a local ``test.xlsx`` scratch file,
      3. uploads it to the ``xiniudata-report`` OSS bucket under a fresh uuid,
      4. stores the public link and sets ``processStatus`` to 1.

    Runs forever; sleeps 30s between polling rounds. Any per-item failure is
    logged and the item is skipped (it stays at processStatus 0 and will be
    retried on a later round).

    Relies on module-level ``mongo``, ``conn``, ``data_code``, ``util``,
    ``oss2_helper`` and ``logger`` being initialized elsewhere in the file.
    """
    collection = mongo.task.data_report
    while True:
        items = collection.find({'processStatus': 0}).limit(10)
        for item in items:
            try:
                startDate, endDate = item['param']['startDate'], item['param']['endDate']
                logger.info('processing %s ~ %s.xlsx' % (startDate, endDate))
                df, columns = data_code.run2(conn, mongo, startDate=startDate,
                                             endDate=endDate, param=item['param'])
                df.to_excel('test.xlsx', index=0, columns=columns, encoding="utf-8")
                path = os.path.join(sys.path[0], 'test.xlsx')
                fileid = util.get_uuid()
                oss = oss2_helper.Oss2Helper("xiniudata-report")
                # "with" guarantees the scratch file is closed even when
                # oss.put raises (the old code leaked the handle in that case).
                with open(path, "rb") as fp:
                    oss.put(
                        fileid,
                        fp,
                        headers={
                            "Content-Type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                            "x-oss-meta-filename": 'funding_news_report_%s~%s.xlsx' % (startDate, endDate)
                        })
                logger.info('uploaded funding_news_report_%s ~ %s.xlsx' % (startDate, endDate))
                collection.update_one({'_id': item['_id']}, {
                    '$set': {
                        'processStatus': 1,
                        'link': 'http://www.xiniudata.com/file/report/%s' % fileid
                    }
                })
            except Exception:
                # Keep the worker alive; log the full traceback instead of
                # just the exception message so failures are diagnosable.
                logger.exception("failed to process report item %s", item.get('_id'))
        logger.info('sleep')
        time.sleep(30)
def process(rep):
    """Download the report PDF described by *rep* and archive it to OSS + mongo.

    :param rep: dict with at least ``durl`` (download URL), ``title``,
                ``filename`` and ``source``.
    :returns: True when the file was uploaded and recorded, False when the
              download keeps failing (>20 attempts), the file is missing or
              unreadable, or an identical report already exists.
    """
    res = 0
    while True:
        delete()  # clear any stale download before (re)trying
        res += 1
        if res > 20:
            # give up after 20 download attempts
            return False
        run(rep["durl"])
        logger.info("saving done")
        file_path = "download.pdf"
        if not os.path.isfile(file_path):
            return False
        # getPage returns (page count, creation date); a None date means the
        # PDF could not be parsed.
        pages, pdfcreationDate = getPage(file_path)
        if pdfcreationDate is None:
            return False
        size = os.path.getsize(file_path)
        md5 = util.get_file_md5(file_path)
        # skip duplicates already archived under the same md5/title
        if check_file_exists(md5, rep["title"]):
            return False
        fileid = util.get_uuid()
        logger.info("%s, %s, %s, %s, %s, %s",
                    rep["title"], size, pdfcreationDate, pages, md5, fileid)
        oss = oss2_helper.Oss2Helper("xiniudata-report")
        # "with" closes the handle even if oss.put raises (the original code
        # leaked the file object on error).
        with open(file_path, "rb") as fp:
            oss.put(fileid, fp, headers={
                "Content-Type": "application/pdf",
                "x-oss-meta-filename": rep["filename"]
            })
        mongo = db.connect_mongo()
        try:
            # timestamps are stored shifted by -8h (UTC from local CST,
            # matching the rest of the project)
            mongo.article.report.insert_one({
                "source": rep["source"],
                "description": None,
                "title": rep["title"],
                "filename": rep["filename"],
                "size": size,
                "pdfCreationDate": pdfcreationDate,
                "pages": pages,
                "md5": md5,
                "fileid": fileid,
                "createTime": datetime.datetime.now() - datetime.timedelta(hours=8),
                "modifyTime": datetime.datetime.now() - datetime.timedelta(hours=8),
                "type": 78001
            })
        finally:
            # close the connection even when insert_one fails
            mongo.close()
        return True
import oss2_helper #logger loghelper.init_logger("stock_aggregate", stream=True) logger = loghelper.get_logger("stock_aggregate") source_map = { 13400: "全国中小企业股份转让系统|http://www.neeq.com.cn", 13401: "上海证券交易所|http://www.sse.com.cn", 13402: "深圳证券交易所|http://www.szse.cn" } round_map = {13400: 1105, 13401: 1110, 13402: 1110} # kafka kafkaProducer = None oss2put = oss2_helper.Oss2Helper() def init_kafka(): global kafkaProducer (url) = config.get_kafka_config() kafka = KafkaClient(url) # HashedPartitioner is default kafkaProducer = SimpleProducer(kafka) def send_message(company_id, action): if kafkaProducer is None: init_kafka() #action: create, delete
def process(org):
    """Poll one organisation's cold-call IMAP mailbox and turn each message
    into a ``sourcedeal`` row (plus forwarding records and attachment files).

    :param org: dict/row with the org's id, name and IMAP credentials
                (``coldcall_imap_server``, ``coldcall_imap_port``,
                ``coldcall_username``, ``coldcall_password``).

    Messages are fetched one at a time until the mailbox is drained.
    Duplicate deals (same org + normalized title + sender) are skipped.
    """
    if org["coldcall_imap_server"] is None:
        return
    logger.info("orgId: %s, orgName: %s", org["id"], org["name"])
    # Strips common reply/forward subject prefixes ("Re:", "Fwd:", "回复" etc.)
    # in many languages so that replies map to the same deal title.
    re_name = re.compile(
        '([\[\(] *)?(RE?S?|FYI|RIF|I|FS|VB|RV|ENC|ODP|PD|YNT|ILT|SV|VS|VL|AW|WG|ΑΠ|ΣΧΕΤ|ΠΡΘ|תגובה|הועבר|主题|转发|FWD?) *([-:;)\]][ :;\])-]*|$)|\]+ *$',
        re.IGNORECASE)
    while True:
        # one=True: fetch a single message per round; loop until empty.
        msgs = email_reader.receive(org["coldcall_imap_server"],
                                    org["coldcall_imap_port"],
                                    org["coldcall_username"],
                                    org["coldcall_password"],
                                    one=True)
        if len(msgs) == 0:
            break
        for msg in msgs:
            # Prefer a plain-text rendering of the HTML body when present.
            if msg["html"] is not None:
                parser = html2text.HTML2Text()
                parser.ignore_emphasis = True
                parser.single_line_break = True
                msg["html_text"] = parser.handle(msg["html"])
            else:
                msg["html_text"] = None
            logger.info(msg["subject"])
            logger.info(msg["from"])
            logger.info(msg["to"])
            logger.info(msg["cc"])
            # logger.info(msg["body"])
            # logger.info(msg["html_text"])
            logger.info("attachments=%d" % len(msg["attachments"]))
            for attach in msg["attachments"]:
                logger.info(attach.name)
            # Deal title = subject minus reply/forward prefixes; dedup key is
            # (org, md5(title), sender).
            title = re_name.sub('', msg["subject"]).strip()
            title_md5 = util.md5str(title)
            #insert
            conn = db.connect_torndb()
            cc = conn.get(
                "select * from sourcedeal where orgId=%s and titleMd5=%s and origin=%s limit 1",
                org["id"], title_md5, msg["from"])
            conn.close()
            if cc is not None:
                logger.info("%s Exists!" % title)
                continue
            # Body preference: html-derived text, then plain body, then empty;
            # clipped to 20000 chars to fit the column.
            content = msg["html_text"]
            if content is None:
                content = msg["body"]
            if content is None:
                content = ""
            content = content.strip()
            if len(content) > 20000:
                content = content[0:20000]
            # sponsor = sender, assignee = first matching user on CC.
            sponsor_id = find_user(org["id"], msg["from"])
            logger.info("sponsor_id=%s" % sponsor_id)
            assignee_id = find_user(org["id"], msg["cc"])
            logger.info("assignee_id=%s" % assignee_id)
            conn = db.connect_torndb()
            cc_id = conn.insert(
                "insert sourcedeal(title,titleMd5,content,orgId,createTime,origin,assignee,sponsor) \
                values(%s,%s,%s,%s,%s,%s,%s,%s)",
                title, title_md5, content, org["id"], msg["date"], msg["from"],
                assignee_id, sponsor_id)
            if assignee_id is None:
                # No assignee found on CC: pick a random investment manager
                # and record the forward without a fromUserId.
                ids = get_investment_manager_ids(org["id"])
                assignee_id = choice(ids)
                conn.update("update sourcedeal set assignee=%s where id=%s",
                            assignee_id, cc_id)
                conn.insert(
                    "insert sourcedeal_forward(sourcedealId,toUserId,createTime) "
                    "values(%s,%s,%s)", cc_id, assignee_id, msg["date"])
            else:
                conn.insert(
                    "insert sourcedeal_forward(sourcedealId,fromUserId,toUserId,createTime) "
                    "values(%s,%s,%s,%s)", cc_id, sponsor_id, assignee_id,
                    msg["date"])
            # Store document attachments (office/pdf/archive types only) in
            # OSS and link them to the deal.
            for attach in msg["attachments"]:
                if attach.name is not None and attach.name.strip() != "":
                    name = attach.name.strip()
                    if not name.lower().endswith("pdf") and \
                            not name.lower().endswith("rar") and \
                            not name.lower().endswith("zip") and \
                            not name.lower().endswith("7z") and \
                            not name.lower().endswith("ppt") and \
                            not name.lower().endswith("pptx") and \
                            not name.lower().endswith("doc") and \
                            not name.lower().endswith("docx") and \
                            not name.lower().endswith("xls") and \
                            not name.lower().endswith("xlsx"):
                        continue
                    (content_type, encoding) = mimetypes.guess_type(name)
                    if content_type is None:
                        content_type = "application/octet-stream"
                    data = attach.getvalue()
                    # mongo = db.connect_mongo()
                    # imgfs = gridfs.GridFS(mongo.gridfs)
                    # logo_id = imgfs.put(data, content_type=content_type, filename=name)
                    # mongo.close()
                    logo_id = util.get_uuid()
                    logger.info("gridfs logo_id=%s" % logo_id)
                    oss2 = oss2_helper.Oss2Helper()
                    headers = {"Content-Type": content_type}
                    oss2.put(str(logo_id), data, headers=headers)
                    conn.insert(
                        "insert sourcedeal_file(sourcedealId,filename,fileId,createTime) "
                        "values(%s,%s,%s,%s)", cc_id, name, logo_id,
                        msg["date"])
            conn.close()
from bson import ObjectId

# Python 2 idiom: force utf-8 as the default codec for implicit
# str<->unicode conversions across the script.
reload(sys)
sys.setdefaultencoding("utf-8")

# Make the sibling ../util package importable from this script's location.
sys.path.append(
    os.path.join(os.path.split(os.path.realpath(__file__))[0], '../util'))
import loghelper, db, util, oss2_helper

#logger
loghelper.init_logger("migrate_file", stream=True)
logger = loghelper.get_logger("migrate_file")

# Shared connections/handles for the migration: mongo GridFS is the source,
# OSS2 (default bucket) is the destination.
mongo = db.connect_mongo()
grid = GridFS(mongo.gridfs)
oss2 = oss2_helper.Oss2Helper()


def save_oss2_image(grid_id):
    """Migrate a single image from GridFS to OSS2, keyed by its grid id.

    Skips blank ids and ids already recorded in ``mongo.temp.gridid``
    (presumably the migration's done-marker collection — confirm against the
    code that writes it, which is outside this view).
    """
    if grid_id is None or grid_id.strip() == "":
        return
    item = mongo.temp.gridid.find_one({"gridid": grid_id})
    if item is not None:
        return
    out = grid.get(ObjectId(grid_id))
    logger.info(out.name)
    # Re-encode via util.convert_image; upload always declares image/jpeg.
    img, xsize, ysize = util.convert_image(out, out.name)
    headers = {"Content-Type": "image/jpeg"}
    oss2.put(grid_id, img, headers=headers)
def process(dir_path, filename):
    """Archive one local PDF file into OSS and the report store.

    :param dir_path: directory containing the file.
    :param filename: file name; only ``*.pdf`` (case-insensitive) is handled.
                     A leading "source:" prefix in the name (full-width colon)
                     is split off as the report source.
    :returns: True when uploaded (or an identical report already exists),
              False when the path is not a usable PDF.
    """
    file_path = os.path.join(dir_path, filename)
    if not os.path.isfile(file_path):
        return False
    if not filename.lower().endswith(".pdf"):
        return False
    # Read page count; decrypt in place first when the PDF is encrypted.
    # try/finally ensures the handle is closed even if PdfFileReader raises
    # (the original code leaked it in that case).
    fp = open(file_path, "rb")
    try:
        pdfReader = PdfFileReader(fp)
        if pdfReader.isEncrypted:
            fp.close()
            logger.info("File encrypted! filename: %s", filename)
            decrypt_pdf(file_path)
            fp = open(file_path, "rb")
            pdfReader = PdfFileReader(fp)
        pages = pdfReader.getNumPages()
    finally:
        fp.close()
    # The PDF metadata creation date proved unreliable; use the filesystem
    # ctime instead, shifted by -8h (UTC from local CST).
    ts = os.path.getctime(file_path)
    dt = datetime.fromtimestamp(ts) - timedelta(hours=8)
    size = os.path.getsize(file_path)
    title = filename[0:-4].strip()  # drop the ".pdf" suffix
    source = None
    if u"\uff1a" in title:
        # "source：title" naming convention (full-width colon)
        strs = title.split(u"\uff1a", 1)
        source = strs[0]
        title = strs[1]
    md5 = util.get_file_md5(file_path)
    # Already archived: report success without re-uploading.
    if check_file_exists(md5, title):
        return True
    fileid = util.get_uuid()
    logger.info("%s, %s, %s, %s, %s, %s", title, size, dt, pages, md5, fileid)
    oss = oss2_helper.Oss2Helper("xiniudata-report")
    with open(file_path, "rb") as upload_fp:
        oss.put(fileid, upload_fp, headers={
            "Content-Type": "application/pdf",
            "x-oss-meta-filename": filename.strip()
        })
    save(source, filename, title, size, dt, pages, md5, fileid)
    return True