import csv
import pickle
import sys
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
# SQLite (database helpers) and cleanupdocuments (text cleaner) are
# project-local helpers assumed to already be importable in this script.


def stopwords():
    # Load paper text from the SQLite database
    cursor, conn = SQLite.connect_to_databse(database_location)
    datatable = SQLite.list_all_rows(cursor, 'papers')
    paperdata = []
    for row in datatable:
        paperdata.append(row[6])
    cleandoc = cleanupdocuments(paperdata)

    # Build the bag-of-words corpus and dictionary, then persist both
    id2word = corpora.Dictionary(cleandoc)
    corpus = [id2word.doc2bow(text) for text in cleandoc]
    dictionary = corpora.Dictionary(cleandoc)
    with open('ALL_corpus.pkl', 'wb') as pklFile:
        pickle.dump(corpus, pklFile)
    dictionary.save('ALL_dictionary.gensim')

    # Flatten every document into one combined token list and prepend it,
    # so corpus[0] below scores the collection as a whole
    cp_all = []
    for doc in cleandoc:
        cp_all.extend(doc)
    cleandoc.insert(0, cp_all)
    print('done part dos, **thumbs up**')

    # TF-IDF-score every token in the combined document
    dct = Dictionary.load('ALL_dictionary.gensim')
    corpus = [dct.doc2bow(line) for line in cleandoc]
    model = TfidfModel(corpus)
    vector = model[corpus[0]]
    print('done part tres, **smiley face**')

    cp_stop = []
    for token_id, token_weight in vector:
        cp_stop.append((dct.get(token_id), token_weight))
    print('done part quatros, yeehaw!')

    # Write all scored words, then rewrite the file keeping only the words
    # whose score exceeds the threshold passed on the command line
    headers = ('word', 'score')
    with open('stopwords.csv', 'w', newline='', encoding='utf-8') as outFile:
        wtr = csv.writer(outFile)
        wtr.writerow(headers)
        wtr.writerows(cp_stop)

    with open('stopwords.csv', 'r', newline='', encoding='utf-8') as inFile:
        csvreader = csv.reader(inFile)
        itr = iter(csvreader)
        next(itr)  # skip the header row
        # stopwordvalue = [row for row in itr if float(row[1]) > 0.007]
        stopwordvalue = [row for row in itr if float(row[1]) > float(sys.argv[2])]

    with open('stopwords.csv', 'w', newline='', encoding='utf-8') as outFile:
        wt = csv.writer(outFile)
        wt.writerow(headers)
        wt.writerows(stopwordvalue)
    print('STOP WORDS FOUND!!! Stored in stopwords.csv')
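# =============================================================================
# Hypothetical helper, a minimal sketch that is not part of the original
# pipeline: it reads stopwords.csv back into a set so a later cleaning pass
# could filter tokens against it. The name load_stopwords and its default
# path argument are assumptions made for illustration.
# =============================================================================
def load_stopwords(path='stopwords.csv'):
    import csv
    with open(path, 'r', newline='', encoding='utf-8') as inFile:
        reader = csv.reader(inFile)
        next(reader)  # skip the ('word', 'score') header row
        return {row[0] for row in reader}
# Example use: tokens = [t for t in tokens if t not in load_stopwords()]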
import matplotlib.pyplot as plt
import numpy as np
import time
import sys

# =============================================================================
# User variables
# =============================================================================
# Location of SQLite database
database_location = '/home/greenbur/NLP/Python Code/WorkingPapersGOMlg.sqlite'
# Path where the model will be saved
savemodelpath = '/home/greenbur/NLP/Results/GOMlgvec.txt'

# Load document data from the database:
# connect to the SQLite database and load the data
cursor, conn = SQLite.connect_to_databse(database_location)
datatable = SQLite.list_all_rows(cursor, 'papers')

# Collect paper text into a Python list
paperdata = []
for row in datatable:
    paperdata.append(row[6])

# Clean text for processing
cleandoc = cleanupdocuments(paperdata)
print("Documents loaded and ready to process")

# This section builds the Word2Vec model and saves the model
print("Starting word2vec")
# Build Word2Vec model, params adjusted for future testing
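# =============================================================================
# Minimal sketch of the model build: the original parameter values are not
# shown in this listing, so the numbers below are assumptions for
# illustration (gensim >= 4 names the dimensionality vector_size; older
# releases used size).
# =============================================================================
from gensim.models import Word2Vec

w2v_model = Word2Vec(
    sentences=cleandoc,   # one token list per cleaned paper
    vector_size=100,      # assumed embedding dimensionality
    window=5,             # assumed context window
    min_count=2,          # assumed minimum token frequency
    workers=4)            # assumed number of worker threads
# Save the vectors in plain-text word2vec format at the configured path
w2v_model.wv.save_word2vec_format(savemodelpath, binary=False)
print("word2vec model saved to " + savemodelpath)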