# NOTE(review): this entire function is an exact duplicate of the
# word2phrase_filelevel definition immediately below; Python binds the
# name to the later definition, so this copy is dead code and should be
# removed once callers are confirmed.
def word2phrase_filelevel(formatCorpusFile, depFile, outputPhraseCorpusFile, outputPhraseDepFile):
    """
    func: transfer word based corpus into phrase based one
    :param formatCorpusFile: the file that needs to be transfered
    :param depFile: the corresponding dependency file
    :param outputPhraseCorpusFile: the transfered phrase based output file
    :param outputPhraseDepFile: the transfered phrase based dependency file
    :return: n/a
    """
    # Phrase lookup table built from a fixed project-relative resource file.
    phrasemap = buildBilingualDict.phraseMapping('../data/phrase.lst')
    sentences_triples = readInDependencyTriples(depFile)
    sentences, clbls, flbls = get_english_raw_sentences_labels(formatCorpusFile)
    # Corpus and dependency file must be aligned line-for-line.
    assert len(sentences) == len(sentences_triples)

    newDepinfo = []
    newCorpus = []

    for i, sent in enumerate(sentences):
        sent_triples = sentences_triples[i]
        # Merge dependency triples and rewrite the sentence using the phrase map.
        newdeps = mergeDependencyTree(sent_triples, sent, phrasemap)
        newsent = word2phrase_sentencelevel(sent, phrasemap)
        newDepinfo.append(formDependencyTripleLine(newdeps))
        # Output line format: <label>\t<phrase-based sentence>\n
        newCorpus.append(flbls[i]+'\t'+newsent+'\n')

    with open(outputPhraseCorpusFile, 'w') as writer:
        writer.writelines(newCorpus)
    with open(outputPhraseDepFile, 'w') as writer:
        writer.writelines(newDepinfo)
def word2phrase_filelevel(formatCorpusFile, depFile, outputPhraseCorpusFile,
                          outputPhraseDepFile):
    """
    func: transfer word based corpus into phrase based one
    :param formatCorpusFile: the file that needs to be transfered
    :param depFile: the corresponding dependency file
    :param outputPhraseCorpusFile: the transfered phrase based output file
    :param outputPhraseDepFile: the transfered phrase based dependency file
    :return: n/a
    """
    # Phrase lookup table loaded from the fixed project resource file.
    phrasemap = buildBilingualDict.phraseMapping('../data/phrase.lst')
    triples_per_sentence = readInDependencyTriples(depFile)
    sentences, clbls, flbls = get_english_raw_sentences_labels(
        formatCorpusFile)
    # Corpus and dependency file must be aligned line-for-line.
    assert len(sentences) == len(triples_per_sentence)

    dep_lines = []
    corpus_lines = []

    for idx, (sent, triples) in enumerate(zip(sentences, triples_per_sentence)):
        # Collapse multi-word phrases in both the dependency tree and the
        # raw sentence, keeping the two representations consistent.
        merged_deps = mergeDependencyTree(triples, sent, phrasemap)
        phrased_sent = word2phrase_sentencelevel(sent, phrasemap)
        dep_lines.append(formDependencyTripleLine(merged_deps))
        # Output line format: <label>\t<phrase-based sentence>\n
        corpus_lines.append(flbls[idx] + '\t' + phrased_sent + '\n')

    with open(outputPhraseCorpusFile, 'w') as out:
        out.writelines(corpus_lines)
    with open(outputPhraseDepFile, 'w') as out:
        out.writelines(dep_lines)
def preprocessEnglishCorpus(CorpusFile, preprocessedFile):
    """
    func: preprocess corpus
    params: CorpusFile: corpus file path
    params: preprocessedFile: the output preprocessed file
    return: n/a

    Reads the corpus in large buffered chunks (readlines sizehint), applies
    word-level then phrase-level filtering to each line, and streams the
    result to preprocessedFile.
    """
    # print(...) with a single string argument behaves identically under
    # Python 2's print statement and Python 3's print function.
    print('preprocessing...')
    phrasemap = buildBilingualDict.phraseMapping('../data/phrase.lst')
    buffsize = 250000000  # sizehint (bytes) per readlines() chunk
    buffcount = 0
    # Context managers guarantee both handles are closed even on error
    # (the original leaked the reader and reopened the output per buffer).
    with open(CorpusFile, 'r') as reader, open(preprocessedFile, 'w') as writer:
        while True:
            lines = reader.readlines(buffsize)
            if not lines:
                break  # EOF reached
            buffcount += 1
            print('building with ' + str(buffcount) + ' buffer.....')
            outputbuffer = []
            for line in lines:
                newwords = []
                # Expand/normalize each token; english_word_filter may yield
                # zero or more tokens per input word.
                for word in line.split():
                    newwords.extend(english_word_filter(word))
                # Collapse known multi-word phrases using the phrase map.
                newwords = english_phrase_filter(newwords, phrasemap)
                outputbuffer.append(' '.join(newwords) + '\n')
            print('writing buffer...')
            writer.writelines(outputbuffer)