Example #1
def get_detail(link):
    print(link)
    bf = BloomFilter()
    if not bf.isContains(link):  # only handle links we have not seen before
        dic = {'url': link}
        col.insert(dic)  # 'col': a module-level collection (e.g. a MongoDB collection) from the source project
        bf.insert(link)
        try:
            next_page(link)
        except Exception as e:
            print(e)
    else:
        logger.warning('exists : {}'.format(link))
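
Every example in this listing leans on a BloomFilter helper with isContains and insert methods (Redis-backed in most of them), but the class itself never appears here. Below is a minimal Redis-bitmap sketch of that interface, assuming redis-py and an md5-based multi-seed hash scheme; it is illustrative only, not the project's actual implementation:

import hashlib

import redis


class BloomFilter(object):
    # Illustrative sketch: all bits live in one Redis string under 'key'.
    def __init__(self, rconn=None, key='bloomfilter',
                 bit_size=1 << 25, seeds=(5, 7, 11, 13, 31)):
        self.rconn = rconn or redis.Redis('127.0.0.1', 6379)
        self.key = key
        self.bit_size = bit_size
        self.seeds = seeds

    def _offsets(self, value):
        # Derive one bit offset per seed from a single md5 digest.
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        digest = int(hashlib.md5(value).hexdigest(), 16)
        return [(digest * seed) % self.bit_size for seed in self.seeds]

    def isContains(self, value):
        # All bits set: the value was (probably) inserted before.
        return all(self.rconn.getbit(self.key, offset)
                   for offset in self._offsets(value))

    def insert(self, value):
        for offset in self._offsets(value):
            self.rconn.setbit(self.key, offset, 1)

The no-argument construction in Examples #1 and #8 works through the defaults; the other examples pass an existing connection and a per-purpose key such as 'supplier:merge'.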
Example #2
 def startCommit(self):
     customers = self.readAllLinesFromExcel(self.customer_info_path,
                                            'Sheet1')
     for customer in customers:
         customer_id = customer[0]
         # Columns 3 and 4 hold comma-separated catalog and source lists.
         customer_catalogs = customer[3].split(',')
         customer_sources = customer[4].split(',')
         customer_data_folder = '{0}/{1}/{2}'.format(
             self.data4customers_path, customer_id, self.today)
         customer_data_folder_txt = '{0}/{1}/{2}/txt'.format(
             self.data4customers_path, customer_id, self.today)
         customer_data_exists = os.path.exists(customer_data_folder)
         self.bf = BloomFilter(self.rconn, customer_id)
         if not customer_data_exists:
             os.makedirs(customer_data_folder)
             os.makedirs(customer_data_folder_txt)
         for catalog in customer_catalogs:
             for source in customer_sources:
                 self.commitSingleCatalogSource(customer_id, catalog,
                                                source,
                                                customer_data_folder,
                                                customer_data_folder_txt)
     time_elapsed = time.time() - self.since
     self.writeToTxt(
         self.log_path,
         "{0}: commit done! in {1:.0f}m {2:.0f}s".format(self.getCurrntTime(),
                                                         time_elapsed // 60,
                                                         time_elapsed % 60))
     print('commit done! in {0:.0f}m {1:.0f}s'.format(time_elapsed // 60,
                                                      time_elapsed % 60))
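
startCommit depends on a readAllLinesFromExcel helper that the listing does not show. A plausible openpyxl-based sketch; only the name and call shape come from the example above, the body is an assumption:

from openpyxl import load_workbook


def readAllLinesFromExcel(self, path, sheet_name):
    # Hypothetical helper: return every row of the sheet as a list of cell
    # values, so callers can index columns positionally (customer[0], ...).
    sheet = load_workbook(path, read_only=True)[sheet_name]
    return [[cell.value for cell in row] for row in sheet.iter_rows()]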
Example #3
 def __init__(self):
     self.rconn = redis.Redis('127.0.0.1', 6379)
     self.bf_huxiu = BloomFilter(self.rconn, 'supplier:commit_huxiu')
     self.class_finished_path = '/home/dev/Data/Production/catalogs'
     self.today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
     self.log_path = '/home/dev/Data/Production/log/{0}_log.log'.format(
         self.today)
     self.customer_info_path = '/home/dev/Data/Production/customerInfo/customers.xlsx'
     self.data4customers_path = '/home/dev/Data/Production/data4customers'
     self.model_huxiu_title_path = '/home/dev/Data/npl/classifier/fastText/model_data/news_fasttext.model.huxiu.bin'
     self.model_huxiu_content_path = '/home/dev/Data/npl/classifier/fastText/model_data/news_fasttext.model.huxiu_content.bin'
     self.classifier_content = fasttext.load_model(
         self.model_huxiu_content_path)
     self.classifier_title = fasttext.load_model(
         self.model_huxiu_title_path)
     self.since = time.time()
Example #4
File: sentiment.py (project: hulu7/news)
 def __init__(self):
     self.rconn = redis.Redis('127.0.0.1', 6379)
     self.bf = BloomFilter(self.rconn, 'supplier:merge')
Example #5
File: sentiment.py (project: hulu7/news)
class Sentiment(object):
    def __init__(self):
        self.rconn = redis.Redis('127.0.0.1', 6379)
        self.bf = BloomFilter(self.rconn, 'supplier:merge')

    def isDuplicated(self, title):
        title_encode = str(title).encode("utf-8")
        if self.bf.isContains(title_encode):
            print('Title {0} exists!'.format(title))
            return True
        else:
            # First sighting: remember the title and report it as new.
            self.bf.insert(title_encode)
            print('Title {0} does not exist!'.format(title))
            return False

    def storeFinished(self, title):
        print('Start to store title: {0}'.format(title))
        title_encode = title.encode("utf-8")
        self.bf.insert(title_encode)

    def readFromCSV(self, filePath):
        # The 'with' block closes the file; no explicit close() is needed.
        with open(filePath, 'r') as csv_file:
            return list(csv.reader(csv_file))

    def writeToCSVWithHeader(self, filePath, content, header):
        with open(filePath, 'a') as csv_file:
            csv_file.write(codecs.BOM_UTF8)  # BOM so spreadsheet apps detect UTF-8
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(header)
            # 'content' may be a single row or a list of rows.
            if len(content) > 0 and isinstance(content[0], list):
                for item in content:
                    csv_writer.writerow(item)
            else:
                csv_writer.writerow(content)

    def writeToCSVWithoutHeader(self, filePath, content):
        with open(filePath, 'a') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(content)

    def readFromTxt(self, file_path):
        with open(file_path, 'r') as txt_file:
            content = txt_file.read()
        # Drop empty strings produced by trailing newlines.
        return list(filter(None, content.split('\n')))

    def readColsFromCSV(self, file_path, col_names):
        cols = pd.read_csv(file_path, usecols=col_names)
        return cols

    def analysis(self, filein_path, fileout_path):
        if not os.path.exists(filein_path):
            print('in file: {0} does not exist.'.format(filein_path))
            return
        if not os.path.exists(fileout_path):
            print('out file: {0} does not exist.'.format(fileout_path))
            self.writeToCSVWithoutHeader(fileout_path, [
                'share_number', 'comment_number', 'url', 'title', 'sentiment'
            ])
            print('created a new out file: {0}.'.format(fileout_path))

        in_content = self.readFromCSV(filein_path)
        in_content.pop(0)  # drop the header row
        for item in in_content:
            # xmnlp.sentiment scores the text in the fourth column (the title).
            s = xmnlp.sentiment(item[3])
            self.writeToCSVWithoutHeader(
                fileout_path, [item[0], item[1], item[2], item[3], s])
            print("{0}--{1}".format(item[3], s))
Example #6
 def __init__(self):
     self.rconn = redis.Redis('127.0.0.1', 6379)
     self.bf = BloomFilter(self.rconn, 'supplier:classification')
     self.today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
Example #7
class UpdateProductionClass(object):
    def __init__(self):
        self.rconn = redis.Redis('127.0.0.1', 6379)
        self.bf = BloomFilter(self.rconn, 'supplier:classification')
        self.today = time.strftime('%Y-%m-%d', time.localtime(time.time()))

    def isDuplicated(self, title):
        title_encode = str(title).encode("utf-8")
        if self.bf.isContains(title_encode):
            print('Title {0} exists!'.format(title))
            return True
        else:
            # First sighting: remember the title and report it as new.
            self.bf.insert(title_encode)
            print('Title {0} does not exist!'.format(title))
            return False

    def storeFinished(self, title):
        print('Start to store title: {0}'.format(title))
        title_encode = title.encode("utf-8")
        self.bf.insert(title_encode)

    def readFromCSV(self, filePath):
        # The 'with' block closes the file; no explicit close() is needed.
        with open(filePath, 'r') as csv_file:
            return list(csv.reader(csv_file))

    def writeToCSVWithHeader(self, filePath, content, header):
        with open(filePath, 'a') as csv_file:
            csv_file.write(codecs.BOM_UTF8)  # BOM so spreadsheet apps detect UTF-8
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(header)
            # 'content' may be a single row or a list of rows; guard against
            # an empty list before peeking at content[0].
            if len(content) > 0 and isinstance(content[0], list):
                for item in content:
                    csv_writer.writerow(item)
            else:
                csv_writer.writerow(content)

    def writeToCSVWithoutHeader(self, filePath, content):
        # Write the BOM only when creating the file; appending one per call
        # would scatter BOMs through the rows (see the scrub in startClassify).
        is_new_file = not os.path.exists(filePath)
        with open(filePath, 'a') as csv_file:
            if is_new_file:
                csv_file.write(codecs.BOM_UTF8)
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(content)

    def readFromTxt(self, file_path):
        with open(file_path, 'r') as txt_file:
            content = txt_file.read()
        # Drop empty strings produced by trailing newlines.
        return list(filter(None, content.split('\n')))

    def writeToTxt(self, filePath, content):
        with open(filePath, 'a+') as txt_file:
            txt_file.write(content)
            txt_file.write('\n')

    def extractKeyWords(self, content):
        # Segment with jieba and join tokens with spaces, the input format
        # fastText expects.
        text = content.replace("\t", " ").replace("\n", " ")
        return " ".join(jieba.cut(text))

    def extractTime(self, content):
        # Keep only the digits of the timestamp string and return the first
        # eight, i.e. YYYYMMDD (Python 2: filter() on a str returns a str).
        digits = filter(str.isdigit, content.encode('gbk'))
        return digits[0:8]

    def getCurrntTime(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def loadModel(self, model_path):
        self.classifier = fasttext.load_model(model_path)

    def predictClass(self, content):
        # Returns [(label, probability)] for the single best class; labels
        # look like '__label__tech', hence the split('__') at the call site.
        keywords = self.extractKeyWords(content)
        return self.classifier.predict_proba([keywords], k=1)[0]

    def startClassify(self, txt_path, content_path, class_finished_path,
                      model_path, log_path, catalogs, name):
        self.log_path = log_path
        self.content_path = content_path
        self.class_finished_path = class_finished_path
        self.model_path = model_path
        self.catalogs = catalogs
        self.txt_path = txt_path
        if not os.path.exists(content_path):
            return
        content = self.readFromCSV(content_path)
        self.finishedIds = []
        for catalog in catalogs:
            catalog_file_path = '{0}/{1}'.format(self.class_finished_path,
                                                 catalog)
            catalog_cache_file_path = '{0}/{1}/cache'.format(
                self.class_finished_path, catalog)
            catalog_txt_file_path = '{0}/{1}/txt'.format(
                self.class_finished_path, catalog)
            # Create the output tree (finished / catalog / cache / txt) on demand.
            if not os.path.exists(self.class_finished_path):
                os.mkdir(self.class_finished_path)
            if not os.path.exists(catalog_file_path):
                os.mkdir(catalog_file_path)
            if not os.path.exists(catalog_cache_file_path):
                os.mkdir(catalog_cache_file_path)
            if not os.path.exists(catalog_txt_file_path):
                os.mkdir(catalog_txt_file_path)
            catalog_path = '{0}/{1}/{2}_{3}.csv'.format(
                self.class_finished_path, catalog, self.today, catalog)
            catalog_cache_path = '{0}/{1}/cache/{2}_{3}_cache.csv'.format(
                self.class_finished_path, catalog, self.today, name)
            if os.path.exists(catalog_cache_path):
                cache_list = self.readFromCSV(catalog_cache_path)
                for cache_item in cache_list[1:]:
                    # Scrub stray UTF-8 BOMs that earlier appends left in the cache.
                    self.finishedIds.append(
                        str(cache_item[0].replace('\xef\xbb\xbf', '')))
            else:
                self.writeToCSVWithoutHeader(catalog_cache_path, ['id'])
            if not os.path.exists(catalog_path):
                self.writeToCSVWithoutHeader(catalog_path, [
                    'id', 'title', 'url', 'time', 'catalog', 'deep',
                    'is_open_cache', 'source', 'author_name', 'images'
                ])
        total = '0'
        if not content:
            return
        # Resolve column positions from the header row once, then walk the rows.
        header = content[0]
        self.id_index = header.index('id')
        self.title_index = header.index('title')
        self.url_index = header.index('url')
        self.time_index = header.index('download_time')
        self.is_open_cache = header.index('is_open_cache')
        self.source = header.index('source')
        self.author_name = header.index('author_name')
        self.images = header.index('images')
        for item in content[1:]:
            id = item[self.id_index]
            title = item[self.title_index]
            url = item[self.url_index]
            time_ = item[self.time_index]
            is_open_cache = item[self.is_open_cache]
            source = item[self.source]
            author_name = item[self.author_name]
            images = item[self.images]
            if len(title) == 0 or len(url) == 0 or len(time_) == 0:
                self.finishedIds.append(id)
                continue
            if not self.isDuplicated(title):
                file = '{0}_{1}.txt'.format(name, id)
                most_possible = self.predictClass(title)
                catalog = str(most_possible[0][0].split('__')[2])
                # 'dep': project-level helper; howDeep appears to score article depth.
                deep = dep.howDeep('{0}/{1}'.format(self.txt_path, file))
                if len(str(deep)) == 0:
                    self.writeToTxt(
                        log_path,
                        "{0}: empty {1}".format(self.getCurrntTime(), file))
                    continue
                catalog_cache_path = '{0}/{1}/cache/{2}_{3}_cache.csv'.format(
                    self.class_finished_path, catalog, self.today, name)
                catalog_path = '{0}/{1}/{2}_{3}.csv'.format(
                    self.class_finished_path, catalog, self.today, catalog)
                YMD = self.extractTime(time_)
                self.writeToCSVWithoutHeader(catalog_cache_path, [id])
                self.finishedIds.append(id)
                self.storeFinished(title)
                self.writeToCSVWithoutHeader(catalog_path, [
                    id, title, url, YMD, catalog, deep, is_open_cache, source,
                    author_name, images
                ])
                origin_txt_path = '{0}/{1}'.format(self.txt_path, file)
                classed_txt_path = '{0}/{1}/txt/{2}'.format(
                    self.class_finished_path, catalog, file)
                copyfile(origin_txt_path, classed_txt_path)

                time_elapsed = time.time() - since  # 'since': module-level start timestamp
                total = str(len(self.finishedIds))
                print(
                    total + ' complete in {:.0f}m {:.0f}s'.format(
                        time_elapsed // 60, time_elapsed % 60))
        time_elapsed = time.time() - since
        self.writeToTxt(
            log_path,
            "{0}: {1} classify done! in {2:.0f}m {3:.0f}s".format(
                self.getCurrntTime(), total, time_elapsed // 60,
                time_elapsed % 60))
        print('classify done! in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
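
startClassify also reads the module-level name since (the run's start timestamp), so a driver has to define it before calling in. A minimal sketch; every argument below is a placeholder except the model path, which appears in Example #3:

since = time.time()
update = UpdateProductionClass()
update.loadModel('/home/dev/Data/npl/classifier/fastText/model_data/'
                 'news_fasttext.model.huxiu.bin')
update.startClassify(
    txt_path='/tmp/txt',                  # placeholder
    content_path='/tmp/content.csv',      # placeholder
    class_finished_path='/tmp/catalogs',  # placeholder
    model_path='',                        # stored but unused once loadModel has run
    log_path='/tmp/classify.log',         # placeholder
    catalogs=['tech', 'finance'],         # placeholder label keys
    name='huxiu')                         # placeholder source name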
Example #8
def to_bloom(link):
    # Record the link in the Bloom filter without checking for it first.
    bf = BloomFilter()
    bf.insert(link)
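
to_bloom only records a link; pairing it with isContains gives the check-then-insert deduplication from Example #1. A tiny sketch:

bf = BloomFilter()


def seen_before(link):
    # True from the second sighting of the same link onwards,
    # subject to the Bloom filter's false-positive rate.
    if bf.isContains(link):
        return True
    bf.insert(link)
    return False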
Example #9
 def __init__(self):
     self.rconn = redis.Redis('127.0.0.1', 6379)
     self.bf = BloomFilter(self.rconn, 'supplier:merge')
     self.filein = '/home/dev/Data/Production/data4customers'
     self.fileout = '/home/dev/Data/Production/data4deepinews'
     self.today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
Example #10
class ProductionMerge(object):
    def __init__(self):
        self.rconn = redis.Redis('127.0.0.1', 6379)
        self.bf = BloomFilter(self.rconn, 'supplier:merge')
        self.filein = '/home/dev/Data/Production/data4customers'
        self.fileout = '/home/dev/Data/Production/data4deepinews'
        self.today = time.strftime('%Y-%m-%d', time.localtime(time.time()))

    def isDuplicated(self, title):
        title_encode = str(title).encode("utf-8")
        if self.bf.isContains(title_encode):
            print('Title {0} exists!'.format(title))
            return True
        else:
            # First sighting: remember the title and report it as new.
            self.bf.insert(title_encode)
            print('Title {0} does not exist!'.format(title))
            return False

    def storeFinished(self, title):
        print('Start to store title: {0}'.format(title))
        title_encode = title.encode("utf-8")
        self.bf.insert(title_encode)

    def readFromCSV(self, filePath):
        # The 'with' block closes the file; no explicit close() is needed.
        with open(filePath, 'r') as csv_file:
            return list(csv.reader(csv_file))

    def writeToCSVWithHeader(self, filePath, content, header):
        with open(filePath, 'a') as csv_file:
            csv_file.write(codecs.BOM_UTF8)  # BOM so spreadsheet apps detect UTF-8
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(header)
            # 'content' may be a single row or a list of rows.
            if len(content) > 0 and isinstance(content[0], list):
                for item in content:
                    csv_writer.writerow(item)
            else:
                csv_writer.writerow(content)

    def writeToCSVWithoutHeader(self, filePath, content):
        with open(filePath, 'a') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(content)

    def readFromTxt(self, file_path):
        with open(file_path, 'r') as txt_file:
            content = txt_file.read()
        # Drop empty strings produced by trailing newlines.
        return list(filter(None, content.split('\n')))

    def readColsFromCSV(self, file_path, col_names):
        cols = pd.read_csv(file_path, usecols=col_names)
        return cols

    def getCatalog(self):
        # Map catalog keys to their Chinese display names.
        return {
            'finance': '财经',
            'politics': '党政',
            'comic': '动漫',
            'house': '房产',
            'home': '家居',
            'health': '健康',
            'edu': '教育',
            'military': '军事',
            'tech': '科技',
            'history': '历史',
            'travel': '旅游',
            'food': '美食',
            'agriculture': '农业',
            'car': '汽车',
            'emotion': '情感',
            'design': '设计',
            'society': '社会',
            'photography': '摄影',
            'collect': '收藏',
            'digital': '数码',
            'sports': '体育',
            'culture': '文化',
            'game': '游戏',
            'entertainment': '娱乐',
            'baby': '育儿',
            'IT': 'IT互联网',
            'career': '职场',
            'life': '养生',
            'lottery': '彩票',
            'pet': '宠物',
            'fashion': '时尚',
            'festival': '节日',
            'funny': '幽默',
            'psychology': '心理',
            'story': '故事汇',
            'wedding': '婚礼',
            'Movie': '电影',
            'TV': '电视',
            'buddhism': '佛教',
            'government': '政府',
            'astrology': '星座'
        }

    def Merge(self):
        catalog = self.getCatalog()
        users = os.listdir(self.filein)
        out_csv_file = "{0}/{1}.csv".format(self.fileout, self.today)
        output_content = []
        finished_titles = []
        for user in users:
            in_csv_file = "{0}/{1}/{2}/{3}.csv".format(self.filein, user,
                                                       self.today, self.today)
            if not os.path.exists(in_csv_file):
                continue
            csv_content = self.readFromCSV(in_csv_file)
            if len(csv_content) < 2:
                continue

            if not os.path.exists(out_csv_file):
                self.writeToCSVWithoutHeader(out_csv_file, [
                    'title', 'url', 'time', 'catalog', 'user', 'source',
                    'images'
                ])

            for item in csv_content[1:]:
                if self.isDuplicated(item[1]):
                    # Title already merged: just append this user to its row.
                    for content in output_content:
                        if content[0] == item[1]:
                            if user not in content[4]:
                                content[4] = "{0},{1}".format(content[4], user)
                    continue

                if item[5] == '':  # skip rows whose 'deep' field is empty
                    continue
                if item[3] == '':  # default a missing date to today (YYYYMMDD)
                    item[3] = str(self.today).replace('-', '')
                output_content.append([
                    item[1], item[2], item[3], catalog[item[4]], user, item[7],
                    item[9]
                ])
                self.storeFinished(item[1])

        for content in output_content:
            self.writeToCSVWithoutHeader(out_csv_file, content)
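
Merge walks every user folder under filein, deduplicates rows by title through the shared 'supplier:merge' Bloom filter (appending extra users to an existing row instead of duplicating it), and writes the merged rows to {fileout}/{today}.csv. Running it is a two-liner:

merger = ProductionMerge()
merger.Merge()  # appends to /home/dev/Data/Production/data4deepinews/<today>.csv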