"""Train a gensim Doc2Vec model on SEC filings streamed from the DB.

Tokenizes filings with NLTK's Punkt sentence tokenizer, fits a
100-dimensional paragraph-vector model, and persists it to disk under
``model_name`` so the follow-up feature-extraction step can load it.
"""
import logging

import nltk.data
from gensim.models import doc2vec

import settings
from filing_iterator import filings_iterator

# Surface gensim's vocab-build / training progress on the console.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Config parameters.
year_start = 2000
year_end = 2015
num_features = 100  # dimensionality of the learned document vectors
model_name = "sec_filings_model_{}.d2v".format(num_features)

# Prepare training data: an iterator over tokenized filings pulled from the DB.
sentences = filings_iterator(
    tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
    N=10000,
    useDB=True,
    year_start=year_start,
    year_end=year_end)

model = doc2vec.Doc2Vec(
    sentences,
    size=num_features,
    min_count=1,   # keep every token, even ones that appear once
    seed=5,        # fixed seed for (approximate) reproducibility
    window=20,
    sample=1e-3,   # downsample very frequent words
    workers=4)

# Fix: the original computed model_name but left model.save() commented out,
# discarding the trained model. Persist it so downstream code can load it.
model.save(model_name)

# ==============================================================
# retrieve features for each document and store them in a file
# ==============================================================
'''
Created on Aug 29, 2015

@author: akshaym
'''
# Scan stopword-stripped filings in the DB and print the longest
# document's length in words.
import logging

import nltk.data

import settings
from filing_iterator import filings_iterator

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Config parameters.
year_start = 2000
year_end = 2015
n_docs = 1
num_features = 50
model_name = "sec_filings_model_{}.d2v".format(num_features)

# Iterator over stopword-stripped filings stored in the DB.
filings_it = filings_iterator(
    tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
    N=9681,
    useDB=True,
    year_start=year_start,
    year_end=year_end)

# Fix: the original compared the *character* count (len(doc)) against a
# running maximum stored in *words* (len(doc.split(" "))), mixing units and
# producing a wrong maximum. Compare and store the same quantity: words.
max_len = 0
for doc in filings_it.get_filing_without_stopwords_from_db():
    word_count = len(doc.split(" "))
    if word_count > max_len:
        max_len = word_count

print(max_len)
"""Train a gensim Doc2Vec model over SEC filings from the DB and save it.

Builds a 100-dimensional paragraph-vector model from filings tokenized
with NLTK's Punkt tokenizer, then writes the model to ``model_name``.
"""
import logging

import nltk.data
from gensim.models import doc2vec

import settings
from filing_iterator import filings_iterator

# Show gensim's training/vocab progress while the model fits.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Config parameters.
year_start = 2000
year_end = 2015
num_features = 100  # dimensionality of the document vectors
model_name = "sec_filings_model_{}.d2v".format(num_features)

# Training corpus: iterator of tokenized filings drawn from the DB.
sentences = filings_iterator(
    tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
    N=10000,
    useDB=True,
    year_start=year_start,
    year_end=year_end)

model = doc2vec.Doc2Vec(
    sentences,
    size=num_features,
    min_count=1,   # do not discard rare tokens
    seed=5,        # fixed seed for (approximate) reproducibility
    window=20,
    sample=1e-3,   # downsample highly frequent words
    workers=4)

# Fix: model_name was computed but model.save() was commented out, so the
# trained model was lost when the script exited. Persist it.
model.save(model_name)
'''
Created on Aug 29, 2015

@author: akshaym
'''
# Fix: the original file was missing the opening triple-quote of this header
# docstring (a SyntaxError). Restored it above.
#
# Scans stopword-stripped filings in the DB and prints the longest
# document's length in words.
import logging

import nltk.data

import settings
from filing_iterator import filings_iterator

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Config parameters.
year_start = 2000
year_end = 2015
n_docs = 1
num_features = 50
model_name = "sec_filings_model_{}.d2v".format(num_features)

# Iterator over stopword-stripped filings stored in the DB.
filings_it = filings_iterator(
    tokenizer=nltk.data.load('tokenizers/punkt/english.pickle'),
    N=9681,
    useDB=True,
    year_start=year_start,
    year_end=year_end)

# Fix: the original compared len(doc) (characters) against a maximum stored
# as len(doc.split(" ")) (words) — inconsistent units. Use word count in
# both the comparison and the update.
max_len = 0
for doc in filings_it.get_filing_without_stopwords_from_db():
    word_count = len(doc.split(" "))
    if word_count > max_len:
        max_len = word_count

print(max_len)