import os

from gensim import corpora

# Project-local modules (names assumed from the calls below):
import premodel
import models
import lsi_model


def build_comparable_ldamodel_training(comp_folder, dialect):
    counter = 0
    folders = [comp_folder + dialect[0] + '/', comp_folder + dialect[1] + '/']
    # Build one shared dictionary over both dialect corpora.
    for folder in folders:
        for file in os.listdir(folder):
            extension = os.path.splitext(file)[1]
            if extension == '.txt':
                filepath = os.path.join(folder, file)
                texts = premodel.read_text(filepath)
                if counter == 0:
                    dictionary = corpora.Dictionary(texts)
                else:
                    dictionary.add_documents(texts)
                counter = counter + 1
    dictionary.save('parameters/comp_LDAmodel_' + dialect[1] + '.dict')
    # Bag-of-words corpus: documents from both folders, in order.
    corpus = [dictionary.doc2bow(text)
              for text in premodel.read_set_of_file(folders[0])]
    corpus = corpus + [dictionary.doc2bow(text)
                       for text in premodel.read_set_of_file(folders[1])]
    # Store to disk, for later use.
    corpora.MmCorpus.serialize(
        'parameters/comp_LDAmodel_' + dialect[1] + '.mm', corpus)
    return dictionary, corpus
def training_phase(folder, dialect):
    counter = 0
    for file in os.listdir(folder):
        extension = os.path.splitext(file)[1]
        if extension == '.txt':
            filepath = os.path.join(folder, file)
            texts = premodel.read_text(filepath)
            if counter == 0:
                dictionary = corpora.Dictionary(texts)
            else:
                dictionary.add_documents(texts)
            # Without this increment every file would restart the dictionary.
            counter = counter + 1
    # Remove gaps in the id sequence after words that were removed.
    dictionary.compactify()
    dictionary.save('parameters/' + dialect[0] + '_' + dialect[1] + '.dict')
    # Collect bag-of-words statistics over all tokens in the training data.
    corpus = [dictionary.doc2bow(text)
              for text in premodel.read_set_of_file(folder)]
    # Store to disk, for later use.
    corpora.MmCorpus.serialize(
        'parameters/' + dialect[0] + '_' + dialect[1] + '.mm', corpus)
    return dictionary, corpus
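# A minimal sketch of reading the saved artifacts back, assuming the same
# 'parameters/' layout written by training_phase() above. gensim can reload
# both the dictionary and the serialized Matrix Market corpus directly.
# This helper is hypothetical; it mirrors what premodel.upload_data()
# presumably does, which the source does not show.
def load_trained_data(dialect):
    dictionary = corpora.Dictionary.load(
        'parameters/' + dialect[0] + '_' + dialect[1] + '.dict')
    corpus = corpora.MmCorpus(
        'parameters/' + dialect[0] + '_' + dialect[1] + '.mm')
    return dictionary, corpus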
def compute_lsi(folder, dialect, corpus_files):
    # Here `models` is the project's own LSI module, not gensim.models.
    dictionary, corpus = models.training_phase(folder, dialect)
    corpus_lsi, lsi = models.build_lsi_model(corpus, dictionary)
    # Score each document of the second dialect against the LSI space
    # built from the first.
    for document in premodel.read_full_text(corpus_files[1]):
        vec_lsi = models.test_corpus(' '.join(document), lsi, dictionary)
        models.compute_similarity(vec_lsi, corpus_lsi, dialect)
    print('Number of documents in {0} = {1}'.format(dialect[0], len(corpus)))
    print('Number of documents in {0} = {1}'.format(
        dialect[1], len(premodel.read_text(corpus_files[1]))))
def build_ldamodel_training(folder, dialect):
    counter = 0
    for file in os.listdir(folder):
        extension = os.path.splitext(file)[1]
        if extension == '.txt':
            filepath = os.path.join(folder, file)
            texts = premodel.read_text(filepath)
            if counter == 0:
                dictionary = corpora.Dictionary(texts)
            else:
                dictionary.add_documents(texts)
            counter = counter + 1
    dictionary.save('parameters/LDAmodel_' + dialect[1] + '.dict')
    corpus = [dictionary.doc2bow(text)
              for text in premodel.read_set_of_file(folder)]
    # Store to disk, for later use.
    corpora.MmCorpus.serialize('parameters/LDAmodel_' + dialect[1] + '.mm',
                               corpus)
    return dictionary, corpus
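# A minimal sketch of fitting an LDA model on the dictionary/corpus returned
# above. The source does not show the actual training call, so num_topics and
# passes here are illustrative assumptions, not the project's settings.
from gensim.models import LdaModel


def train_lda(dictionary, corpus, num_topics=10):
    # Standard gensim LDA training over the bag-of-words corpus.
    lda = LdaModel(corpus=corpus, id2word=dictionary,
                   num_topics=num_topics, passes=5)
    return lda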
corpus_files = [folder + dialect[0] + '.txt', folder + dialect[1] + '.txt']
# Memory-friendly pass kept for comparison; the uploaded data below is what
# the script actually uses.
dictionary_memory_friendly, corpus_memory_friendly = lsi_model.training_phase(
    folder, dialect)
dictionary, corpus = premodel.upload_data(dialect)
corpus_lsi, lsi = lsi_model.build_lsi_model(corpus, dictionary)

# Score every document of the second dialect against the LSI space. The
# disabled average-similarity computation is sketched below.
for document in premodel.read_full_text(corpus_files[1]):
    vec_lsi = lsi_model.test_corpus(' '.join(document), lsi, dictionary)
    similarity = lsi_model.compute_similarity(vec_lsi, corpus_lsi, dialect)

print('Number of documents in {0} = {1}'.format(dialect[0], len(corpus)))
print('Number of documents in {0} = {1}'.format(
    dialect[1], len(premodel.read_text(corpus_files[1]))))
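# A hedged sketch of the average-similarity computation that the original
# script leaves commented out. It assumes lsi_model.compute_similarity()
# returns an iterable of (doc_id, score) pairs, as the disabled code implied.
def average_positive_similarity(similarities):
    # Mean of the strictly positive scores in one query's result list.
    scores = [score for _, score in similarities if score > 0]
    return sum(scores) / len(scores) if scores else 0.0

# Usage would be inside the loop above, e.g. accumulating
# average_positive_similarity(similarity) per document and averaging at the
# end to report the cross-dialect similarity.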