示例#1
0
 def clean_data(cls, content):
     """Parse ``content`` as JSON, logging the error and the raw input on failure.

     NOTE(review): only the JSON-parsing step is visible in this excerpt;
     the parsed value is neither used nor returned here.
     """
     charset_name = ''
     try:
         payload = json.loads(content)
     except Exception as e:
         # Broad catch kept from the original: a parse failure is logged
         # together with the offending input and never raised.
         mlog.log().error(e)
         mlog.log().error(content)
示例#2
0
 def is_connected(self):
     """Report whether the connection still answers a ``NOOP`` command.

     Returns:
         True when ``self.voidcmd("NOOP")`` completes, False when it raises
         (the error is logged).
     """
     try:
         self.voidcmd("NOOP")
         return True
     except Exception as e:
         mlog.log().error('ftp error:%s' % e)
         return False
示例#3
0
def test():
    """Run the analysis engine over every file of a fixed FTP directory.

    Files are handled in batches of ``batch_size`` and the elapsed time of
    each batch is logged.
    """
    # NOTE(review): host and credentials are hard-coded here; consider
    # loading them from configuration instead.
    config = {
        'ftp': {
            'type': 1,
            'host': '61.147.114.73',
            'port': 21,
            'user': '******',
            'passwd': '123456x',
            'timeout': 5,
            'local': './'
        }
    }

    analysis_engine = AnalysisEngine(config)
    file_list = analysis_engine.input_data('~/text_storage/60006bak')
    batch_size = 5
    i = 0
    count = len(file_list)
    while i < count:
        # The slice width must equal the stride; the previous width of 4
        # with a stride of 5 silently skipped every fifth file.
        tlist = file_list[i:i + batch_size]
        i += batch_size
        start_time = time.time()
        for t in tlist:
            analysis_engine.process_file_data(60006, '~/text_storage/60006bak',
                                              t, 0)
        end_time = time.time()
        mlog.log().info("analysis file count %d  expend %d", i,
                        end_time - start_time)
示例#4
0
def main():
    """Open ``./text.db`` and create the ``search`` table.

    A failure while creating the table is logged, including its cause, and
    is not raised.
    """
    sqlmgr = SQLiteExt("./text.db", 0)

    try:
        create_table_sql = '''CREATE TABLE `search` (
                        `id` BIGINT NOT NULL,
                        `uid` BIGINT NULL,
                        `title` VARCHAR(128) NULL,
                        `text` VARCHAR(40960) NULL,
                        `created_at` INT NULL,
                        `retweet_count` INT NULL,
                        `reply_count` INT NULL,
                        `fav_count` INT NULL,
                        `retweet_id` INT NULL,
                        `type` INT NULL,
                        `source_link` VARCHAR(256),
                        `edited_at` INT NULL,
                        `pic` VARCHAR(256) NULL,
                        `target` VARCHAR(256) NULL,
                        `source` VARCHAR(256) NULL,
                        PRIMARY KEY (`id`));'''

        sqlmgr.create_table(create_table_sql)
    except Exception as e:
        # Keep the cause in the log; the original message discarded it.
        mlog.log().error("Create table failed: %s", e)
示例#5
0
 def create_table(self, sql):
     """Execute a table-creation statement and commit it.

     Args:
         sql: the statement to run; ``None`` or ``''`` is logged as an
             error and nothing is executed.

     Any error from executing or committing propagates to the caller.
     """
     if sql is not None and sql != '':
         cur = self.__get_cursor()
         try:
             cur.execute(sql)
             self.conn.commit()
         finally:
             # Release the cursor even when execute/commit raises.
             self.__close_all(cur)
     else:
         mlog.log().error('the [{}] is empty or equal None!'.format(sql))
示例#6
0
 def run(self):
     """Save the oldest queued item, if any, through ``self.engine``.

     Pops the first entry of ``self.wait_queue`` and passes its ``'db'`` and
     ``'data'`` values to ``self.engine.save``. Any error is logged and
     swallowed; the popped item is not re-queued.
     """
     try:
         if len(self.wait_queue):
             item = self.wait_queue.pop(0)
             self.engine.save(item['db'],
                              item['data'])
     except Exception as e:
         mlog.log().error('save error:%s' % e)
示例#7
0
文件: common.py 项目: flaght/mcrawler
 def nickname_format(cls, str, content):
     """Extract the nickname from the first ``<a>`` element of a markup string.

     ``str`` is parsed as XML after dropping its first two characters and its
     last one; the nickname is the text of the first ``<a>`` element minus
     its leading character. On any failure ``str`` and ``content`` are logged.

     NOTE(review): the extracted nickname is not returned in this excerpt.
     """
     reply_nickname = None
     try:
         tree = ET.fromstring(str[2:len(str) - 1])
         # Element.iter replaces getiterator, which newer Pythons removed.
         for node in tree.iter('a'):
             reply_nickname = node.text[1:]
             break
     except Exception:
         mlog.log().error(str + "===>" + content)
示例#8
0
 def drop_table(self, table):
     """Drop ``table`` if it exists and commit.

     Use with care when the table holds data.

     Args:
         table: the table name; ``None`` or ``''`` is logged as an error and
             nothing is executed.
     """
     if table is not None and table != '':
         # The name is concatenated into the statement, so it must come
         # from a trusted source.
         sql = 'DROP TABLE IF EXISTS ' + table
         cur = self.__get_cursor()
         try:
             cur.execute(sql)
             self.conn.commit()
         finally:
             # Release the cursor even when execute/commit raises.
             self.__close_all(cur)
     else:
         mlog.log().error('the [{}] is empty or equal None!'.format(table))
示例#9
0
 def save(self, sql, data):
     """Insert rows by executing ``sql`` once per entry of ``data``.

     Args:
         sql: the statement to run; ``None`` or ``''`` is logged as an
             error and nothing is executed.
         data: iterable of parameter sets; ``None`` makes the call a no-op.

     Each row is committed on its own, so rows executed before a failure
     stay committed. The error itself propagates to the caller.
     """
     if sql is not None and sql != '':
         if data is not None:
             cur = self.__get_cursor()
             try:
                 for d in data:
                     cur.execute(sql, d)
                     self.conn.commit()
             finally:
                 # Release the cursor even when a row fails.
                 self.__close_all(cur)
     else:
         mlog.log().error('the [{}] is empty or equal None!'.format(sql))
示例#10
0
 def __get_uid(self, content):
     """Collect the second field of every row under ``content['dict']``.

     Returns:
         A dict with ``'pid'`` set to ``XUEQIU_GET_DISCUSSION_UID`` and
         ``'result'`` mapping each collected value to itself.
     """
     tables = content.get('dict')
     uid_map = {}
     for table_name in tables:
         rows = tables[table_name]
         mlog.log().info("tabel name %s content %d", table_name, len(rows))
         for row in rows:
             uid_map[row[1]] = row[1]
     return {
         'pid': local_task_opercode.XUEQIU_GET_DISCUSSION_UID,
         'result': uid_map
     }
示例#11
0
文件: common.py 项目: flaght/mcrawler
 def face_format(cls, content):
     """Find emoticon ``<img>`` tags in ``content`` and record their titles.

     For each regex match that parses as XML and has a ``title`` attribute,
     an entry with the match span, the matched text and the title is
     collected. Matches that fail to parse are logged and skipped.

     NOTE(review): the collected entries are not returned in this excerpt.
     """
     re_t = re.compile('(<img src=.//assets\\.imedao\\.com).*?(images).*?(face).*?(title).*?(alt).*?(>)', re.DOTALL)
     faces = []
     for m in re_t.finditer(content):
         try:
             tree = ET.fromstring(m.group())
             # dict.has_key is Python 2 only; ``in`` works everywhere.
             if 'title' in tree.attrib:
                 value = tree.attrib['title']
                 r = {'start': m.start(), 'end': m.end(), 'str': m.group(), 'value': value}
                 faces.append(r)
         except Exception:
             mlog.log().error(m.group())
示例#12
0
    def __conection_sql(self):
        """Open ``self.conn``, falling back to an in-memory database.

        Connects to ``self.name`` first. If that path is then a regular file,
        ``self.type`` is set to 0; otherwise the connection is replaced by a
        ``:memory:`` one and ``self.type`` is set to 1. Any error is logged
        and swallowed.
        """
        try:
            self.conn = sqlite3.connect(self.name, self.timeout)
            if os.path.exists(self.name) and os.path.isfile(self.name):
                self.type = 0
            else:
                # Close the first handle instead of leaking it.
                self.conn.close()
                self.conn = sqlite3.connect(':memory:')
                self.type = 1

        except Exception as e:
            mlog.log().error('sqlite3 error:%s' % e)
示例#13
0
 def __get_conn(self, path):
     """Return a connection for ``path``, or an in-memory one as fallback.

     ``path`` is the database file path. A connection to it is opened first;
     if ``path`` is then an existing regular file that connection is
     returned, otherwise it is closed and a ``:memory:`` connection is
     returned instead.

     NOTE(review): the existence check runs after ``sqlite3.connect(path)``
     — confirm this ordering is intended.
     """
     conn = sqlite3.connect(path)
     if os.path.exists(path) and os.path.isfile(path):
         mlog.log().info('硬盘上面:[{}]'.format(path))
         return conn
     # Close the unused handle explicitly rather than dropping the reference.
     conn.close()
     mlog.log().info('内存上面:[:memory:]')
     return sqlite3.connect(':memory:')
示例#14
0
 def fetch(self, sql):
     """Run ``sql`` and return all result rows as a new list.

     Returns:
         The fetched rows (possibly empty), or ``None`` after logging an
         error when ``sql`` is ``None`` or ``''``.
     """
     if sql is None or sql == '':
         mlog.log().error('the [{}] is empty or equal None!'.format(sql))
         return None
     cursor = self.__get_cursor()
     cursor.execute(sql)
     return list(cursor.fetchall())
示例#15
0
 def __handle_all_file(self, pid, path):
     """Process every file listed for ``path`` in batches of five.

     After each batch, logs the running position (a multiple of five) and
     the time the batch took.
     """
     names = self.scheduler_engine.input_data(path)
     total = len(names)
     for offset in range(0, total, 5):
         position = offset + 5
         batch = names[offset:position]
         began = time.time()
         for name in batch:
             self.__handle_single_file(pid, path, name)
         ended = time.time()
         mlog.log().info("analysis file count %d expend %d", position,
                         ended - began)
示例#16
0
 def fetchall_data(self, pid):
     """Query every table reported by ``self.sql_mgr`` with a pid-specific SQL.

     Args:
         pid: selects the query builder: ``XUEQIU_GET_MEMBER_MAX`` uses
             ``xqdb.get_member_max``, ``XUEQIU_GET_DISCUSSION_UID`` uses
             ``xqdb.get_user_discuss_max``, anything else ``xqdb.get_id``.

     Returns:
         A dict mapping each table name to ``self.sql_mgr.get_data(sql)``.
     """
     rows_by_table = {}
     for t in self.sql_mgr.get_table():
         # Each entry is joined into a single table-name string.
         s_t = "".join(t)
         mlog.log().info("tablename %s ", s_t)
         if pid == local_task_opercode.XUEQIU_GET_MEMBER_MAX:
             sql = xqdb.get_member_max(s_t)
         elif pid == local_task_opercode.XUEQIU_GET_DISCUSSION_UID:
             sql = xqdb.get_user_discuss_max(s_t)
         else:
             sql = xqdb.get_id(s_t)
         rows_by_table[s_t] = self.sql_mgr.get_data(sql)
     return rows_by_table
示例#17
0
 def run(self):
     """Connect to Kafka and consume messages forever.

     Creates a consumer for ``self.host``, subscribes to ``self.coname`` and
     prints each message decoded as JSON. A message that fails to decode is
     logged and skipped. If iteration over the consumer ends, the consumer
     is closed and a fresh one is created.
     """
     while True:
         consumer = KafkaConsumer(bootstrap_servers=self.host)
         try:
             consumer.subscribe([self.coname])
             for message in consumer:
                 try:
                     # NOTE(review): index 6 is presumably the record's
                     # value field — confirm against the client version.
                     json_info = json.loads(message[6])
                     print(json_info)
                     #self.callback(json_info)
                 except Exception as e:
                     mlog.log().error(e)
         finally:
             # Do not leak the old consumer before the loop makes a new one.
             consumer.close()
示例#18
0
def run(console):
    """Process every file under ``console.path`` with the analysis engine.

    Files are handled in batches of ``batch_size`` and the elapsed time of
    each batch is logged.
    """
    analysis_engine = AnalysisEngine()
    file_list = analysis_engine.input_data(console.path)
    batch_size = 5
    i = 0
    count = len(file_list)
    while i < count:
        # The slice width must equal the stride; the previous width of 4
        # with a stride of 5 silently skipped every fifth file.
        tlist = file_list[i:i + batch_size]
        i += batch_size
        start_time = time.time()
        for t in tlist:
            analysis_engine.process_file_data(console.plt_id, console.path, t,
                                              0)
        end_time = time.time()
        mlog.log().info("analysis file count %d  expend %d", i,
                        end_time - start_time)
示例#19
0
def parser_ftp_method(config, path, pid):
    """Start an analysis engine and process the files of ``path`` in fives.

    After each batch, logs the running position (a multiple of five) and the
    time the batch took.
    """
    engine = AnalysisEngine(config)
    engine.start()
    names = engine.input_data(path)
    total = len(names)
    for offset in range(0, total, 5):
        position = offset + 5
        batch = names[offset:position]
        began = time.time()
        for name in batch:
            engine.process_file_data(pid, path, name, 0)
        ended = time.time()
        mlog.log().info("analysis file count %d  expend %d", position,
                        ended - began)
示例#20
0
    def create_table(self, crate_table_sql, type=1):
        """Create a table through ``self.engine``, optionally dropping it first.

        Args:
            crate_table_sql: the SQL statement that creates the table.
            type: 0 drops the existing table first; 1 keeps it.

        Returns:
            None. A failure while creating the table is logged, not raised;
            a failure while dropping propagates.
        """
        if type == 0:
            # NOTE(review): a complete DROP statement is passed here; verify
            # that engine.drop_table expects a statement and not a bare
            # table name.
            drop_table_sql = 'DROP TABLE IF EXISTS ' + self.table
            self.engine.drop_table(drop_table_sql)
        try:
            self.engine.create_table(crate_table_sql)
        except Exception as e:
            mlog.log().error('create_table error:%s' % e)
示例#21
0
 def day_heat(self, content):
     """Parse an XML document into a symbol and a list of ``(d, v)`` points.

     The symbol is the ``id`` attribute of the last ``Title`` element that
     has one. For every child of an ``Individual`` element with a ``d``
     attribute, ``(d, int(v))`` is collected. Any failure is logged.

     NOTE(review): the parsed values are not returned in this excerpt.
     """
     symbol = ""
     tlist = []
     try:
         tree = ET.fromstring(content)
         # Element.iter / ``in`` replace the Python 2-only getiterator /
         # has_key.
         for node in tree.iter('Title'):
             if "id" in node.attrib:
                 symbol = node.attrib['id']
         for node in tree.iter('Individual'):
             for c in node:
                 if "d" in c.attrib:
                     d = c.attrib["d"]
                     v = c.attrib["v"]
                     tlist.append((d, int(v)))
     except Exception:
         mlog.log().error("error content")
示例#22
0
 def __clean_search_event(self, content):
     """Append a parsed-reply JSON string to every row under ``content['dict']``.

     For each row, ``Discussion().parser_int`` is applied to the fourth
     field, the result is serialised with ``json.dumps`` and appended to a
     list copy of the row. Rows that fail are logged as a xueqiu.com URL
     built from their second and first fields, then skipped.

     NOTE(review): the per-key result is not returned in this excerpt.
     """
     dt = {}
     d = content['dict']
     for key, value in d.items():
         lt = []
         for t in value:
             try:
                 discussion = Discussion()
                 reply = discussion.parser_int(t[3])
                 row = list(t)
                 s = json.dumps(reply)
                 # NOTE(review): relies on the serialised string having a
                 # ``decode`` method — confirm for the target interpreter.
                 row.append(s.decode('unicode-escape'))
                 lt.append(row)
             except Exception:
                 mlog.log().error("https://xueqiu.com/" + str(t[1]) + "/" +
                                  str(t[0]))
         dt[key] = lt
示例#23
0
 def get(self, basic_path, filename, callback=None):
     """Download ``filename`` from ``basic_path`` on the FTP server.

     Args:
         basic_path: '/'-separated directory path, entered one segment at a
             time with ``cwd``.
         filename: name of the file to retrieve.
         callback: receives the downloaded data; defaults to
             ``self.callback`` when ``None``.

     Returns:
         True on success; False when ``self.ping()`` fails or any FTP call
         raises (the error is logged).
     """
     if not self.ping():
         return False
     try:
         for _path in basic_path.split('/'):
             self.ftp.cwd(_path)
         file_size = self.ftp.size(filename)
         handler = self.callback if callback is None else callback
         # The file size is passed as the third argument to retrbinary.
         self.ftp.retrbinary('RETR ' + filename, handler, file_size)
         return True
     except Exception as e:
         mlog.log().error("ftp error:%s url:%s", e, filename)
         return False
示例#24
0
 def quarter_heat(self, content):
     """Parse an XML document into a symbol and ``(d, v, changerate)`` points.

     The symbol is the ``id`` attribute of the last ``Title`` element that
     has one. For every child of an ``Individual`` element with a ``d``
     attribute, ``(d, int(v), changerate)`` is collected. Any failure is
     logged.

     NOTE(review): the parsed values are not returned in this excerpt.
     """
     symbol = ""
     tlist = []
     try:
         tree = ET.fromstring(content)
         # Element.iter / ``in`` replace the Python 2-only getiterator /
         # has_key.
         for node in tree.iter('Title'):
             if "id" in node.attrib:
                 symbol = node.attrib['id']
         for node in tree.iter('Individual'):
             for c in node:
                 if "d" in c.attrib:
                     d = c.attrib["d"]
                     dlist = d.split(' ')
                     # Unused below, but the indexing still raises for a
                     # ``d`` value without a space-separated second part.
                     date = dlist[0]
                     hour = dlist[1]
                     v = c.attrib["v"]
                     changerate = c.attrib["changerate"]
                     tlist.append((d, int(v), changerate))
     except Exception:
         mlog.log().error("error content")
示例#25
0
 def __u_connect(self):
     """Connect and log in to the FTP server in passive mode.

     Returns:
         True after a successful login (``self.is_connected`` is then set to
         True); False when connect or login returns a falsy value or raises.
         Every failure is logged.
     """
     self.ftp.set_pasv(True, self.host)
     try:
         if not self.ftp.connect(self.host, self.port, self.timeout):
             mlog.log().error("connect ftp server failed")
             return False
         if not self.ftp.login(self.name, self.pwd):
             mlog.log().error("login ftp server failed")
             return False
         self.is_connected = True
         mlog.log().info("host : " + self.host + "  ftp login success")
         return True
     except Exception as e:
         mlog.log().error("ftp error[%s]", e)
         return False
示例#26
0
class CleaningCrawler():
    """Decodes crawler payloads delivered as JSON envelopes."""

    @classmethod
    def clean_data(cls, content):
        """Parse the JSON envelope and decode its ``content`` field.

        Args:
            content: JSON text with a ``content`` field (decoded with
                ``base64.b32decode``) and a ``charset`` field.

        Returns:
            None when the JSON is invalid or the field cannot be decoded;
            the error is logged in both cases.

        NOTE(review): only the first two steps are visible in this excerpt;
        the decoded data is not returned here.
        """
        charset_name = ''
        try:
            payload = json.loads(content)
        except Exception as e:
            mlog.log().error(e)
            mlog.log().error(content)
            # Stop here: continuing would only fail on the unbound payload
            # and log a misleading second error.
            return None
        data = ''
        # Decode the encoded body.
        try:
            data = base64.b32decode(payload['content'])
            charset_name = payload['charset']
        except Exception as e:
            mlog.log().error(e)
            return None
示例#27
0
    def run(self):
        """Poll the worker pool until interrupted or out of pending results.

        Afterwards, joins any dismissed worker threads.
        """
        polling = True
        while polling:
            try:
                self.pool.poll()
            except KeyboardInterrupt:
                mlog.log().error("**** Interrupted!")
                polling = False
            except NoResultsPending:
                mlog.log().error("**** No pending results.")
                polling = False

        if not self.pool.dismissedWorkers:
            return
        mlog.log().info("Joining all dismissed worker threads...")
        self.pool.joinAllDismissedWorkers()
示例#28
0
def main():
    """Log the runtime environment, read console input and run the job.

    On Darwin and Linux the default string encoding is switched to UTF-8.
    The job runs as one asynchronous task in a three-process pool, and
    success is logged once the pool has finished.
    """
    log = mlog.log
    log().info('Python %s on %s' % (sys.version, sys.platform))
    sys_str = platform.system()
    log().info(sys_str)
    if sys_str in ("Darwin", "Linux"):
        reload(sys)
        sys.setdefaultencoding('utf-8')  # @UndefinedVariable
    os.chdir(os.getcwd())

    # Console input
    console = Console()
    console.input_info()

    workers = Pool(processes=3)
    outcome = workers.apply_async(run, (console, ))
    workers.close()
    workers.join()

    if outcome.successful():
        log().info("successful")
示例#29
0
        except Exception, e:
            mlog.log().error(e)
            mlog.log().error(content)
        data = ''
        # 解base64
        try:
            data = base64.b32decode(dict['content'])
            charset_name = dict['charset']
        except Exception, e:
            mlog.log().error(e)
            return None
        # 解压缩
        try:
            data = zlib.decompress(data)
        except Exception, e:
            mlog.log().error(e)
            return None

        # 解字符串码
        try:
            data = data.decode(charset_name)
        except Exception, e:
            mlog.log().error(e)
            return None

        url = dict.get('url')
        pid = dict.get('pid')
        if url is not None:
            url = base64.b32decode(url)

        if pid is not None and url is not None:
示例#30
0
 def log(self):
     """Log the host, port, user name and password, one value per line."""
     # NOTE(review): this writes the password to the log in clear text.
     for attr in ('host', 'port', 'name', 'pwd'):
         mlog.log().info(getattr(self, attr))