예제 #1
0
def _cell_text(sheet, row, col, purify=False):
    """Return the value of one sheet cell as stripped unicode text.

    The original code converted every cell with
    ``str(value).decode('utf-8', 'ignore')``.  On Python 2 that raises
    ``UnicodeEncodeError`` as soon as ``value`` is unicode text containing
    non-ASCII characters (``str()`` encodes with the default ASCII codec).
    Here only byte strings are decoded; everything else is formatted
    straight into unicode.

    Args:
        sheet: object exposing ``cell(colx=..., rowx=...)`` whose result
            has a ``value`` attribute.
        row: zero-based row index.
        col: zero-based column index.
        purify: when true, pass the text through
            ``excel_to_json.purify_search_word`` before stripping.

    Returns:
        The stripped text, or ``None`` when the cell value is ``None``.

    Raises:
        IndexError: propagated from ``sheet.cell`` when the cell is missing.
    """
    value = sheet.cell(colx=col, rowx=row).value
    if value is None:
        return None
    if isinstance(value, bytes):
        text = value.decode('utf-8', 'ignore')
    else:
        # The one-element tuple keeps ``%`` from unpacking tuple values.
        text = u'%s' % (value,)
    if purify:
        text = excel_to_json.purify_search_word(text)
    return text.strip()


def chinese_train(cl):
    """Train ``cl`` from every ``*.xlsx`` workbook matched under TRAIN_DIR.

    For each row of each sheet, column 0 is read as the category, column 1
    as the URL and column 2 as the content.  A row is skipped when any of
    those cells is missing or empty, or when the content does not match the
    module-level ``chinese`` pattern.  The workbook path and each accepted
    URL are printed as progress output.

    Args:
        cl: classifier object; ``cl.train(body, cat)`` is called once per
            accepted row.
    """
    for path in glob.glob(TRAIN_DIR + "*.xlsx"):
        print(path)
        excel_file = xlrd.open_workbook(path)
        # Use the public accessor rather than the private ``_sheet_names``.
        for sheet_name in excel_file.sheet_names():
            sheet = excel_file.sheet_by_name(sheet_name)
            for i in range(sheet.nrows):
                # Any missing cell (IndexError) discards the whole row, as
                # every per-column handler in the original did.
                try:
                    cat = _cell_text(sheet, i, 0, purify=True)
                    if not cat:
                        continue
                    url = _cell_text(sheet, i, 1)
                    if not url:
                        continue
                    print(url)
                    content = _cell_text(sheet, i, 2, purify=True)
                    if not content:
                        continue
                except IndexError:
                    continue

                # ``content`` is already unicode; decoding it a second time
                # (as the original did) fails on non-ASCII text.
                body = content.replace('\01', ' ')
                if not chinese.search(body):
                    continue

                cl.train(body, cat)
예제 #2
0
def _cell_text(sheet, row, col, purify=False):
    """Return the value of one sheet cell as stripped unicode text.

    The original code converted every cell with
    ``str(value).decode('utf-8', 'ignore')``.  On Python 2 that raises
    ``UnicodeEncodeError`` as soon as ``value`` is unicode text containing
    non-ASCII characters (``str()`` encodes with the default ASCII codec).
    Here only byte strings are decoded; everything else is formatted
    straight into unicode.

    Args:
        sheet: object exposing ``cell(colx=..., rowx=...)`` whose result
            has a ``value`` attribute.
        row: zero-based row index.
        col: zero-based column index.
        purify: when true, pass the text through
            ``excel_to_json.purify_search_word`` before stripping.

    Returns:
        The stripped text, or ``None`` when the cell value is ``None``.

    Raises:
        IndexError: propagated from ``sheet.cell`` when the cell is missing.
    """
    value = sheet.cell(colx=col, rowx=row).value
    if value is None:
        return None
    if isinstance(value, bytes):
        text = value.decode('utf-8', 'ignore')
    else:
        # The one-element tuple keeps ``%`` from unpacking tuple values.
        text = u'%s' % (value,)
    if purify:
        text = excel_to_json.purify_search_word(text)
    return text.strip()


def excel_test(file_name):
    """Classify every data row of the workbook ``file_name`` and print results.

    Row 0 of each sheet is skipped.  Column 0 is the URL and column 1 the
    title; a row is skipped when either is missing or empty.  Columns 2
    (keywords), 3 (description) and 4 + 5 (content, concatenated) are
    optional and are left out of the classified item when the cell does not
    exist.  One tab-separated line ``category, probability, url`` is printed
    per classified row.

    NOTE(review): ``cl`` is not a parameter; it is resolved as a
    module-level name when ``cl.classify`` is called.

    Args:
        file_name: path of the workbook; a message is printed and the
            function returns if the path does not exist.
    """
    if not os.path.exists(file_name):
        print("file: %s not exists" % file_name)
        return

    excel_file = xlrd.open_workbook(file_name)
    for sheet_name in excel_file.sheet_names():
        print("Sheet name: %s" % sheet_name)
        sheet = excel_file.sheet_by_name(sheet_name)
        # Start at 1: the original skipped row 0 explicitly.
        for i in range(1, sheet.nrows):
            # Required columns: a missing or empty cell discards the row.
            try:
                url = _cell_text(sheet, i, 0)
                if not url:
                    continue
                print(url)
                title = _cell_text(sheet, i, 1, purify=True)
                if not title:
                    continue
            except IndexError:
                continue

            # Optional columns: a missing cell just leaves the field unset.
            try:
                keywords = _cell_text(sheet, i, 2, purify=True)
            except IndexError:
                keywords = None

            try:
                description = _cell_text(sheet, i, 3, purify=True)
            except IndexError:
                description = None

            # Content is only set when both of its columns exist.  A None
            # cell value contributes nothing instead of the text "None".
            try:
                a_content = _cell_text(sheet, i, 4, purify=True) or u''
                p_content = _cell_text(sheet, i, 5, purify=True) or u''
                content = (a_content + p_content).replace('\01', ' ')
            except IndexError:
                content = None

            item = dict()
            item[Classifier.KEY_TITLE] = title
            if keywords is not None:
                item[Classifier.KEY_KEYWORDS] = keywords
            if description is not None:
                item[Classifier.KEY_DESCRIPTION] = description
            if content is not None:
                item[Classifier.KEY_CONTENT] = content

            cat, prob = cl.classify(item, default=Classifier.UNKNOWN_CATEGORY)
            print(u"\t".join([cat, str(prob), url]))
예제 #3
0
def _cell_text(sheet, row, col, purify=False):
    """Return the value of one sheet cell as stripped unicode text.

    The original code converted every cell with
    ``str(value).decode('utf-8', 'ignore')``.  On Python 2 that raises
    ``UnicodeEncodeError`` as soon as ``value`` is unicode text containing
    non-ASCII characters (``str()`` encodes with the default ASCII codec).
    Here only byte strings are decoded; everything else is formatted
    straight into unicode.

    Args:
        sheet: object exposing ``cell(colx=..., rowx=...)`` whose result
            has a ``value`` attribute.
        row: zero-based row index.
        col: zero-based column index.
        purify: when true, pass the text through
            ``excel_to_json.purify_search_word`` before stripping.

    Returns:
        The stripped text, or ``None`` when the cell value is ``None``.

    Raises:
        IndexError: propagated from ``sheet.cell`` when the cell is missing.
    """
    value = sheet.cell(colx=col, rowx=row).value
    if value is None:
        return None
    if isinstance(value, bytes):
        text = value.decode('utf-8', 'ignore')
    else:
        # The one-element tuple keeps ``%`` from unpacking tuple values.
        text = u'%s' % (value,)
    if purify:
        text = excel_to_json.purify_search_word(text)
    return text.strip()


def chinese_train(cl):
    """Train ``cl`` from the workbooks matching TRAIN_DIR + EXCEL_PREFIX.

    Row 0 of each sheet is skipped.  Columns are read as: 0 category,
    1 URL, 2 title, 3 keywords, 4 description, 5 and 6 content (the two are
    concatenated).  A row is skipped when any of those cells does not
    exist, or when category, URL or title is empty.  The workbook path and
    each accepted URL are printed as progress output.

    Args:
        cl: classifier object; ``cl.train(item, cat)`` is called once per
            accepted row, where ``item`` is a dict keyed by the
            ``Classifier.KEY_*`` constants.
    """
    for path in glob.glob(TRAIN_DIR + EXCEL_PREFIX):
        print(path)
        excel_file = xlrd.open_workbook(path)
        # Use the public accessor rather than the private ``_sheet_names``.
        for sheet_name in excel_file.sheet_names():
            sheet = excel_file.sheet_by_name(sheet_name)
            # Start at 1: the original skipped row 0 explicitly.
            for i in range(1, sheet.nrows):
                # Unlike a best-effort read, every column here is
                # mandatory: any missing cell (IndexError) drops the row.
                try:
                    cat = _cell_text(sheet, i, 0, purify=True)
                    if not cat:
                        continue
                    url = _cell_text(sheet, i, 1)
                    if not url:
                        continue
                    print(url)
                    title = _cell_text(sheet, i, 2, purify=True)
                    if not title:
                        continue
                    keywords = _cell_text(sheet, i, 3, purify=True)
                    description = _cell_text(sheet, i, 4, purify=True)
                    # A None cell value contributes nothing to the content
                    # instead of the literal text "None".
                    a_content = _cell_text(sheet, i, 5, purify=True) or u''
                    p_content = _cell_text(sheet, i, 6, purify=True) or u''
                except IndexError:
                    continue

                content = (a_content + p_content).replace('\01', ' ')

                item = dict()
                item[Classifier.KEY_TITLE] = title
                if keywords is not None:
                    item[Classifier.KEY_KEYWORDS] = keywords
                if description is not None:
                    item[Classifier.KEY_DESCRIPTION] = description
                item[Classifier.KEY_CONTENT] = content

                cl.train(item, cat)
예제 #4
0
                               TEST_EXCEL_PATTERN):
        # print(file_name)
        excel_file = xlrd.open_workbook(file_name)
        for sheet_name in excel_file._sheet_names:
            sheet = excel_file.sheet_by_name(sheet_name)
            for i in range(sheet.nrows):
                cat = None
                url = None
                title = None
                keywords = None
                description = None
                content = None
                try:
                    cat = sheet.cell(colx=0, rowx=i).value
                    cat = str(cat).decode('utf-8', 'ignore')
                    cat = excel_to_json.purify_search_word(cat)
                    cat = cat.strip()
                    if not cat:
                        continue
                except IndexError:
                    continue

                try:
                    url = sheet.cell(colx=1, rowx=i).value
                    url = str(url).decode('utf-8', 'ignore')
                    url = url.strip()
                    if not url:
                        continue
                except IndexError:
                    continue