Example #1
def main(lines, output_dir, window, create_vw=True, encoding="utf-8"):
    """
    Count word co-occurrences in `lines` within a token window and write the
    co-occurrence file in vw format (`ww_vw.txt`), the vocabulary (`vocab.txt`)
    and the PMI file (`pmi.txt`) into `output_dir`.
    """

    global OUTPUT_FILE_VW, OUTPUT_FILE_VOCAB, OUTPUT_FILE_PMI, cooc_dict, word_dict, vocab, WINDOW, ENCODING
    checkDirectory(output_dir)
    OUTPUT_FILE_VW = output_dir + 'ww_vw.txt'
    OUTPUT_FILE_VOCAB = output_dir + 'vocab.txt'
    OUTPUT_FILE_PMI = output_dir + 'pmi.txt'
    WINDOW = window
    ENCODING = encoding

    start_time = time.time()
    cooc_dict = defaultdict(int)
    word_dict = defaultdict(int)
    vocab = set()

    for line in tqdm(lines):
        words = line.split()
        __process_post(cooc_dict, word_dict, vocab, words)
    if create_vw:
        __save_vw(cooc_dict, vocab)
    __save_vocab(list(vocab))
    vocab_dict = {word: index for index, word in enumerate(list(vocab))}
    __save_pmi(cooc_dict, word_dict, vocab_dict)

    print '\nTime elapsed: ' + str(round(time.time() - start_time,
                                         3)) + ' sec.'
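The helper __process_post is not shown in this excerpt. Judging from the names cooc_dict, word_dict, vocab and WINDOW, it presumably counts unigram frequencies and word-pair co-occurrences inside a fixed token window; the following is a minimal sketch under that assumption (the pairing and ordering conventions are guesses, not the project's actual code):

from collections import defaultdict

def process_post_sketch(cooc_dict, word_dict, vocab, words, window=7):
    # Hypothetical stand-in for __process_post: count unigram frequencies
    # and co-occurrences of word pairs inside a fixed token window.
    for i, word in enumerate(words):
        word_dict[word] += 1            # unigram count
        vocab.add(word)                 # grow the vocabulary
        for j in range(i + 1, min(i + 1 + window, len(words))):
            pair = tuple(sorted((word, words[j])))   # order-independent key
            cooc_dict[pair] += 1

cooc, freq, voc = defaultdict(int), defaultdict(int), set()
process_post_sketch(cooc, freq, voc, "a b a c".split(), window=2)
print(cooc[('a', 'b')])   # 2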
Example #2
def make_set_each_topic(min_number=5, verbose=False):
    """
    Delete identical sentences from train text
    :param min_number: The minimum value of lines for topic to be included in model
    :return: print the number of processed files
    """
    files_number = 0
    checkDirectory(folder_prepare)
    for file_name in os.listdir(input_path):
        if not file_name.startswith(".") and not file_name.endswith(
                ".ipynb") and not file_name.endswith(
                    ".py") and "for" not in file_name:
            with codecs.open(input_path + file_name, "r",
                             encoding="utf-8") as inputFile:
                lines = inputFile.read().splitlines()
            if len(lines) > min_number:
                files_number += 1
                if verbose:
                    print file_name
                lines = [line.strip() for line in lines if line]
                with codecs.open(folder_prepare + file_name,
                                 "w",
                                 encoding="utf-8") as outputFile:
                    print >> outputFile, "\n\n".join([lines[0]] +
                                                     list(set(lines[1:])))
    # Report the number of processed files
    print files_number
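One detail of the deduplication above: set(lines[1:]) removes duplicates but also scrambles the original sentence order. If order matters downstream, an order-preserving variant (an optional alternative, not what this excerpt does) would be:

from collections import OrderedDict

lines = ["topic title", "a", "b", "a", "c", "b"]
# Keep the first line (the topic title) and the first occurrence of every
# other sentence, preserving the original order.
deduped = [lines[0]] + list(OrderedDict.fromkeys(lines[1:]))
print(deduped)   # ['topic title', 'a', 'b', 'c']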
Example #3

if __name__ == '__main__':
    # Command-line entry point: sys.argv[1] is the source file, sys.argv[2] the output folder

    ENCODING = 'utf8'
    SOURCE_FILE = sys.argv[1]
    WINDOW = 7
    CREATE_VW = True
    FOLDER = sys.argv[2]
    OUTPUT_FILE_VW = FOLDER + 'ww_vw.txt'
    OUTPUT_FILE_VOCAB = FOLDER + 'vocab.txt'
    OUTPUT_FILE_PMI = FOLDER + 'pmi.txt'

    checkDirectory(FOLDER)
    start_time = time.time()
    cooc_dict = defaultdict(int)
    word_dict = defaultdict(int)
    vocab = set()

    with codecs.open(SOURCE_FILE, 'r', ENCODING) as f:
        for line in tqdm(f):
            words = line.split()[2:]
            __process_post(cooc_dict, word_dict, vocab, words)
    if CREATE_VW:
        __save_vw(cooc_dict, vocab)
    __save_vocab(list(vocab))
    vocab_dict = {word: index for index, word in enumerate(list(vocab))}
    __save_pmi(cooc_dict, word_dict, vocab_dict)
    print '\nTime elapsed: ' + str(round(time.time() - start_time,
                                         3)) + ' sec.'
Example #4
def requestVoiceFileFromKanWiki(kanUrl, savePath, kanNameKN):
    """Scrape the voice tables of the wiki page at kanUrl, download every
    audio file into savePath/voice/<kanNameKN>/ and dump the collected
    metadata as a JSON file in the same folder."""
    requestHeaders = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'}

    print('----------------------------------\nStart')
    print('Obtain from {}'.format(kanUrl))
    res = get(kanUrl, headers = requestHeaders).content.decode('utf-8')
    etHtml = etree.HTML(res)
    kanName = etHtml.xpath('normalize-space(//h1[@id="firstHeading"])')

    print('Name:{}'.format(kanName))
    kanVoiceFolder = path.join(savePath, 'voice', kanNameKN)
    checkDirectory(kanVoiceFolder)

    voiceBlock = etHtml.xpath('.//body//div[@id="mw-content-text"]/div[@class="mw-parser-output"]/*')
    parserTag = False
    voiceSetType = ''
    voiceList = []

    defineVoiceJSFile = ''
    defineVoiceSetName = []

    # Character-specific (regular) voice lines
    for blockNode in voiceBlock:
        if blockNode.xpath('normalize-space(./span/text())') == '语音资料':
            parserTag = True
            continue
        if blockNode.xpath('normalize-space(./span/text())') == '游戏资料':
            parserTag = False
            continue
        if not parserTag:
            continue
        # Start parsing from here
        if blockNode.tag == 'h3':
            voiceSetType = blockNode.xpath('normalize-space(./span/text())')
            if voiceSetType.count('游戏') > 0:
                parserTag = False
            else:
                print('Obtain {} voice block'.format(voiceSetType))
            continue

        voiceDect = {}

        if blockNode.tag == 'table':
            # print(len(blockNode.xpath('.//tr')))
            for voiceNode in blockNode.xpath('.//tr'):
                # Skip the header row
                if len(voiceNode.xpath('./th')) > 0:
                    continue
                if len(voiceNode.xpath('./td')) == 3:
                    voiceDect['voiceAudioUrl'] = voiceNode.xpath('normalize-space(./td[1]//a/@data-filesrc)')
                    voiceDect['voiceType'] = voiceNode.xpath('normalize-space(./td[2]//text())')
                    voiceDect['voiceTextJP'] = voiceNode.xpath('normalize-space(./td[3]//text())')
                if len(voiceNode.xpath('./td')) == 1:
                    voiceDect['voiceTextCH'] = voiceNode.xpath('normalize-space(./td//text())')
                    voiceDect['voiceSetType'] = voiceSetType
                    voiceDect['voiceAudioFile'] = downloadFile(voiceDect['voiceAudioUrl'], kanVoiceFolder + "/").split('/')[-1]
                    voiceList.append(deepcopy(voiceDect))
            continue
        if blockNode.tag == 'p' and len(blockNode.xpath('./script')) > 0:
            defineVoiceJSFile = blockNode.xpath('normalize-space(./script/@src)')

        if blockNode.tag == 'script':
            defineVoiceSetName.append(search(r'(?<=").*?(?=")', blockNode.xpath('normalize-space(.//text())')).group(0))

    # Limited (seasonal) voice lines, listed in an external JS/JSON file
    voiceDect = {}   # re-initialise in case the loop above never created it
    defineVoiceJSStr = get(defineVoiceJSFile, headers = requestHeaders).content.decode('utf-8')
    defineVoiceUrlHeader = search(r'(?<=data-filesrc=\\").*?(?=")', defineVoiceJSStr).group(0)
    defineJsonUrl = search(r'(?<=")https.*?json(?=")', defineVoiceJSStr).group(0).replace(".json","").replace("wid","").replace("\"","").replace(" ","").replace("+","")
    for defineType in defineVoiceSetName:
        try:
            defineVoiceDataStr = get("{0}{1}.json".format(defineJsonUrl, defineType), headers = requestHeaders).content.decode('utf-8')
            defineVoiceDataDect = loads(defineVoiceDataStr)
            for defineSetType in defineVoiceDataDect:
                for defineItem in defineVoiceDataDect[defineSetType]:
                    voiceDect['voiceSetType'] = defineSetType
                    voiceDect['voiceType'] = defineVoiceDataDect[defineSetType][defineItem]['vname']
                    voiceDect['voiceTextJP'] = defineVoiceDataDect[defineSetType][defineItem]['ja']
                    voiceDect['voiceTextCH'] = defineVoiceDataDect[defineSetType][defineItem]['zh']
                    voiceDect['voiceAudioUrl'] = defineVoiceUrlHeader + defineVoiceDataDect[defineSetType][defineItem]['file']
                    voiceDect['voiceAudioFile'] = downloadFile(voiceDect['voiceAudioUrl'], kanVoiceFolder + "/").split('/')[-1]
                    voiceList.append(deepcopy(voiceDect))
        except Exception:
            # Skip voice sets whose JSON could not be fetched or parsed
            continue

    with open(path.join(kanVoiceFolder, '{}.json'.format(kanNameKN)), 'w', encoding='utf-8') as jsonSaveFile:
        jsonSaveFile.write(dumps({'nameCH': kanName, 'nameKN': kanNameKN, 'voice': voiceList}, ensure_ascii=False))
    print("Finish\n----------------------------------")