Example #1
# Shared imports for all of the examples below; `premodel` (and, later,
# `models` / `lsi_model`) are project-local helper modules.
import os

from gensim import corpora

import premodel


def build_comparable_ldamodel_training(comp_folder, dialect):
    counter = 0
    folders = [comp_folder + dialect[0] + '/', comp_folder + dialect[1] + '/']
    for folder in folders:
        for file in os.listdir(folder):
            extension = os.path.splitext(file)[1]
            if extension == '.txt':
                filepath = os.path.join(folder, file)
                texts = premodel.read_text(filepath)
                # Build the dictionary from the first file, then grow it.
                if counter == 0:
                    dictionary = corpora.Dictionary(texts)
                else:
                    dictionary.add_documents(texts)
                counter = counter + 1

    dictionary.save('parameters/comp_LDAmodel_' + dialect[1] + '.dict')
    corpus = [
        dictionary.doc2bow(text)
        for text in list(premodel.read_set_of_file(folders[0]))
    ]
    corpus = corpus + [
        dictionary.doc2bow(text)
        for text in list(premodel.read_set_of_file(folders[1]))
    ]
    corpora.MmCorpus.serialize('parameters/comp_LDAmodel_' + dialect[1] +
                               '.mm', corpus)  # store to disk, for later use

    return dictionary, corpus
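
Both dialect folders feed one shared dictionary and one concatenated corpus, which is what makes the resulting topic model comparable across the two dialects. A minimal usage sketch; the folder layout and dialect pair are hypothetical:

# Hypothetical layout: comparable/EGY/*.txt and comparable/GLF/*.txt.
dictionary, corpus = build_comparable_ldamodel_training('comparable/', ('EGY', 'GLF'))
print(len(dictionary), 'unique tokens,', len(corpus), 'documents')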
Example #2
def training_phase(folder, dialect):
    counter = 0

    for file in os.listdir(folder):
        extension = os.path.splitext(file)[1]
        if extension == '.txt':
            filepath = os.path.join(folder, file)
            texts = premodel.read_text(filepath)
            if counter == 0:
                dictionary = corpora.Dictionary(texts)
            else:
                dictionary.add_documents(texts)
            # Without this increment the dictionary was rebuilt from
            # scratch on every file, keeping only the last one.
            counter = counter + 1

    #  Bag of words
    dictionary.compactify()  # remove gaps in id sequence after words that were removed
    dictionary.save('parameters/' + dialect[0] + '_' + dialect[1] + '.dict')

    # - collect statistics about all tokens (training_data)
    corpus = [
        dictionary.doc2bow(text)
        for text in list(premodel.read_set_of_file(folder))
    ]
    corpora.MmCorpus.serialize('parameters/' + dialect[0] + '_' + dialect[1] +
                               '.mm', corpus)  # store to disk, for later use

    return dictionary, corpus
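
Since training_phase persists both artifacts under parameters/, later runs can reload them instead of re-reading the folder. A minimal reload sketch, assuming the same dialect pair used at save time (here a hypothetical one):

from gensim import corpora

dialect = ('EGY', 'GLF')  # hypothetical; must match the pair used when saving
dictionary = corpora.Dictionary.load('parameters/' + dialect[0] + '_' + dialect[1] + '.dict')
corpus = corpora.MmCorpus('parameters/' + dialect[0] + '_' + dialect[1] + '.mm')
print(dictionary)
print(corpus)  # MmCorpus streams documents from disk on demand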
Example #3
def compute_lsi(folder, dialect, corpus_files):
    dictionary, corpus = models.training_phase(folder, dialect)
    corpus_lsi, lsi = models.build_lsi_model(corpus, dictionary)

    # Score every document of the second dialect's file against the
    # trained LSI space.
    for document in premodel.read_full_text(corpus_files[1]):
        vec_lsi = models.test_corpus(' '.join(document), lsi, dictionary)
        models.compute_similarity(vec_lsi, corpus_lsi, dialect)

    print('Number of documents in {0} = {1}'.format(dialect[0], len(corpus)))
    print('Number of documents in {0} = {1}'.format(
        dialect[1], len(premodel.read_text(corpus_files[1]))))
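
The models helpers called above (build_lsi_model, test_corpus, compute_similarity) are not included in these examples. A plausible minimal sketch built on gensim's LsiModel and MatrixSimilarity; the bodies, the num_topics value, and the (index, score) return shape are assumptions, not the original implementation:

from gensim import similarities
from gensim.models import LsiModel

def build_lsi_model(corpus, dictionary, num_topics=200):
    # Project the bag-of-words corpus into a latent semantic space.
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=num_topics)
    return lsi[corpus], lsi

def test_corpus(text, lsi, dictionary):
    # Map a raw string into the same LSI space via the shared dictionary.
    return lsi[dictionary.doc2bow(text.split())]

def compute_similarity(vec_lsi, corpus_lsi, dialect):
    # Cosine similarity of the query against every training document.
    index = similarities.MatrixSimilarity(corpus_lsi)
    return list(enumerate(index[vec_lsi]))  # (document index, score) pairs

Rebuilding the similarity index on every call is wasteful; in practice it would be built once outside the loop, but this matches the call shape used above.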
Example #4
def build_ldamodel_training(folder, dialect):
    counter = 0

    for file in os.listdir(folder):
        extension = os.path.splitext(file)[1]
        if extension == '.txt':
            filepath = os.path.join(folder, file)
            texts = premodel.read_text(filepath)
            if counter == 0:
                dictionary = corpora.Dictionary(texts)

            else:
                dictionary.add_documents(texts)
            counter = counter + 1

    dictionary.save('parameters/LDAmodel_' + dialect[1] + '.dict')
    corpus = [
        dictionary.doc2bow(text)
        for text in list(premodel.read_set_of_file(folder))
    ]
    corpora.MmCorpus.serialize('parameters/LDAmodel_' + dialect[1] + '.mm',
                               corpus)  # store to disk, for later use

    return dictionary, corpus
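
The dictionary and corpus returned here are the inputs to the actual LDA fit, which these examples stop short of. A minimal sketch using gensim's LdaModel; the folder, dialect pair, and num_topics are illustrative choices, not values from the original code:

from gensim.models import LdaModel

# Hypothetical inputs: a folder of .txt files and a dialect pair.
dictionary, corpus = build_ldamodel_training('data/', ('EGY', 'GLF'))
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=5)
for topic_id, words in lda.show_topics(num_topics=10, formatted=False):
    print(topic_id, [w for w, _ in words])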
Example #5
# This fragment arrived without its enclosing definition; the signature
# below is a reconstruction (assumed), matching how the other examples
# are called. The averaging logic was present but commented out in the
# original and has been restored.
def run_lsi_pipeline(folder, dialect):
    corpus_files = [folder + dialect[0] + '.txt', folder + dialect[1] + '.txt']

    dictionary_memory_friendly, corpus_memory_friendly = lsi_model.training_phase(
        folder, dialect)

    dictionary, corpus = premodel.upload_data(dialect)
    corpus_lsi, lsi = lsi_model.build_lsi_model(corpus, dictionary)

    summation = 0
    count = 0
    for document in premodel.read_full_text(corpus_files[1]):
        vec_lsi = lsi_model.test_corpus(' '.join(document), lsi, dictionary)
        similarity = lsi_model.compute_similarity(vec_lsi, corpus_lsi, dialect)

        # compute the avg similarity across the corpus
        summation = summation + sum(y for _, y in similarity if y > 0)
        count = count + sum(1 for _, y in similarity if y != 0)

    if count == 0:  # if there is only one file
        count = 1
    print('Number of documents in {0} = {1}'.format(dialect[0], len(corpus)))
    print('Number of documents in {0} = {1}'.format(
        dialect[1], len(premodel.read_text(corpus_files[1]))))
    print('The avg similarity between {0} and {1} is {2}'.format(
        dialect[0], dialect[1], summation / count))
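
None of the premodel readers used throughout (read_text, read_full_text, read_set_of_file) are shown either. A minimal sketch of the shapes the call sites imply; the bodies are assumptions: every reader must produce token lists, since the results are fed to corpora.Dictionary and doc2bow. upload_data presumably reloads the saved .dict/.mm pair, as in the reload sketch after Example #2.

import os

def read_text(filepath):
    # One tokenized document per non-empty line, as a list.
    with open(filepath, encoding='utf-8') as f:
        return [line.split() for line in f if line.strip()]

def read_full_text(filepath):
    # Same shape as read_text, streamed lazily.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            if line.strip():
                yield line.split()

def read_set_of_file(folder):
    # Every tokenized line of every .txt file in the folder.
    for file in os.listdir(folder):
        if os.path.splitext(file)[1] == '.txt':
            for tokens in read_text(os.path.join(folder, file)):
                yield tokens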