def process_records(records, fields, target, textmodel=None):
    """Build a sparse feature matrix and a label list from dict-like records.

    For each record, every value named in ``fields`` is classified by the
    external ``is_number`` helper: numeric values become one row of a numeric
    matrix, everything else is stringified, lower-cased and tokenized as text.
    Text features are produced by ``textmodel`` and placed to the left of the
    numeric columns.

    Args:
        records: sized sequence of mappings; ``len(records)``, ``record.get``
            and ``record[field]`` are used.
        fields: names of the feature fields to read from each record.
        target: name of the field holding the label.
        textmodel: ``None`` (ignore text), one of the names ``'lsi'``,
            ``'tfidf'`` or ``'hashing'``, or an already fitted model object
            that supports ``textmodel[corpus]``.

    Returns:
        Tuple ``(X, y_labels, textmodel)``.  ``X`` is a sparse matrix, or
        ``None`` when no features could be built.  ``textmodel`` is the fitted
        model for ``'lsi'``/``'tfidf'``, and ``None`` for ``'hashing'`` or when
        only numeric features were found.

    Raises:
        ValueError: if ``textmodel`` is a string other than the three
            supported names.
    """
    tokenize = CountVectorizer().build_analyzer()
    text_buf = None  # one line of space-joined tokens per record with text
    num_X = None  # numeric features, allocated on the first numeric record
    y_labels = []

    for i, record in enumerate(records):
        nums = []
        strs = []
        y_labels.append(record.get(target))
        for field in fields:
            if is_number(record.get(field)):
                nums.append(record[field])
            else:
                strs.append(str(record.get(field) or "").lower())
        if strs:
            if text_buf is None:
                text_buf = StringIO.StringIO()
            text_buf.write(" ".join(tokenize(" ".join(strs))) + "\n")
        if nums:
            if num_X is None:
                # Column count is fixed by the first record with numbers.
                num_X = sp.lil_matrix((len(records), len(nums)))
            num_X[i] = np.array(nums, dtype=np.float64)

    numeric = num_X.tocsr() if num_X is not None else None

    if text_buf is None:
        # No text anywhere: a supplied text model was not applied, so it is
        # only reported back when there are no features at all.
        if numeric is not None:
            textmodel = None
        X = numeric
    else:
        text_X = None
        if isinstance(textmodel, basestring):
            if textmodel == 'lsi':
                corpus = TextCorpus(text_buf)
                textmodel = LsiModel(corpus, chunksize=1000)
            elif textmodel == 'tfidf':
                corpus = TextCorpus(text_buf)
                textmodel = TfidfModel(corpus)
            elif textmodel == 'hashing':
                textmodel = None
                hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
                text_buf.seek(0)
                text_X = hasher.transform(
                    tokenize(line.strip()) for line in text_buf)
            else:
                raise ValueError("unknown textmodel: %r" % (textmodel,))
        elif textmodel:
            # A pre-fitted model was passed in; it still needs a corpus to
            # transform (previously this path hit an unbound ``corpus``).
            # NOTE(review): TextCorpus builds a fresh dictionary here, so token
            # ids may not match the ones the model was fitted with -- confirm.
            corpus = TextCorpus(text_buf)

        if textmodel:
            num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs', []))
            text_X = corpus2csc(textmodel[corpus], num_terms).transpose()

        # Only stack when text features really exist; otherwise the numeric
        # block would be concatenated with itself.
        if text_X is None:
            X = numeric
        elif numeric is None:
            X = text_X
        else:
            X = sp.hstack([text_X, numeric], format='csr')

    if X is not None:
        sys.stderr.write("X SHAPE: %s\n" % (X.shape,))

    return X, y_labels, textmodel
def pretrain():
    """Pre-train on the text corpus: build and persist its dictionary and BOW corpus.

    Uses the names ``text_corpus_file``, ``dict_file``, ``dic_txt_file`` and
    ``mm_corpus_file``, which are not defined in this function and are
    expected to come from the enclosing module.
    """
    gutenberg_corpus = TextCorpus(text_corpus_file)

    # Persist the dictionary both in binary and in human-readable form.
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)

    # serialize() only writes the file and returns None, so printing its
    # result always showed "None"; reopen the written corpus to report on it.
    MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    mm = MmCorpus(mm_corpus_file)
    print(mm)
def __init__(self, lex_prob_file, corpus_file):
    """Load lexical probabilities and corpus word frequencies.

    Args:
        lex_prob_file: path to a whitespace-separated text file.  On every
            non-blank line the second column is used as the key and the third
            column is parsed as a float and appended to that key's list.
        corpus_file: input handed to ``TextCorpus``; every token it yields is
            counted into ``self.corpus_freq``.

    Raises:
        IndexError: if a non-blank line has fewer than three columns.
        ValueError: if the third column is not a valid float.
    """
    self.lex_prob = defaultdict(list)
    with open(lex_prob_file) as lex_file:
        for line in lex_file:
            # split() already discards the trailing newline; slicing off the
            # last character would corrupt a final line that has none.
            chunks = line.split()
            if not chunks:
                continue  # tolerate blank lines
            self.lex_prob[chunks[1]].append(float(chunks[2]))

    corpus = TextCorpus(input=corpus_file)
    self.corpus_freq = FreqDist(
        [word for line in corpus.get_texts() for word in line])
    self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]
def train(text_corpus_file, dict_file):
    """Fit a 400-topic LSI model on a text corpus, save it and print its basis.

    Args:
        text_corpus_file: input handed to ``TextCorpus``.
        dict_file: path of a saved ``Dictionary`` used as the id-to-word map.

    The model is saved to ``model_file``, a name that is not defined in this
    function and is expected to come from the enclosing module.
    """
    training_corpus = TextCorpus(text_corpus_file)
    id2word = Dictionary.load(dict_file)

    lsi = LsiModel(
        corpus=training_corpus,
        id2word=id2word,
        num_topics=400,
    )
    lsi.save(model_file)

    # Report the projection matrix, its total size and the size of one row.
    print(lsi.projection.u)
    print(lsi.projection.u.size)
    print(lsi.projection.u[0].size)
def train_gensim():
    """Train a gensim skip-gram Word2Vec model on ``args.txtPath`` and save it.

    Reads its configuration from ``args`` (not defined in this function):
    ``stop_word_lang``, ``stop_word_file``, ``txtPath``, ``step_size``,
    ``vector_size``, ``window_size`` and ``modelPath``.  Stop words from the
    Spark default list and/or a one-word-per-line file are removed from the
    token stream before training.  Prints the elapsed wall-clock time.
    """
    from gensim.corpora import TextCorpus
    from gensim.corpora.textcorpus import lower_to_unicode
    from gensim.models import Word2Vec as GensimWord2Vec

    start = time()

    # A set keeps the per-token membership test in the filter O(1).
    stopwords = set()
    if args.stop_word_lang:
        # starting spark only for this...
        spark = SparkSession.builder.appName("load stop words").getOrCreate()
        try:
            stopwords.update(
                StopWordsRemover.loadDefaultStopWords(args.stop_word_lang))
        finally:
            # Always release the Spark context, even if loading fails.
            spark.sparkContext.stop()
    if args.stop_word_file:
        with open(args.stop_word_file) as stop_word_file:
            stopwords.update(word.strip("\n") for word in stop_word_file)

    def remove_stopwords(tokens):
        return [token for token in tokens if token not in stopwords]

    corpus = TextCorpus(
        args.txtPath,
        # NOTE(review): placeholder dictionary, presumably so TextCorpus does
        # not scan the input to build its own vocabulary -- confirm.
        dictionary={None: None},
        character_filters=[lower_to_unicode],
        token_filters=[remove_stopwords]
    )

    model = GensimWord2Vec(
        seed=1,
        alpha=args.step_size,
        size=args.vector_size,
        window=args.window_size,
        sample=1e-6,
        sg=1
    )
    # Two passes over the text: one for the vocabulary, one for training.
    model.build_vocab(corpus.get_texts())
    model.train(corpus.get_texts(), total_examples=model.corpus_count, epochs=model.epochs)
    model.save(args.modelPath)

    end = time()
    print("Gensim training took {} seconds".format(end - start))
# Optional command-line overrides: argv[3] is the number of topics and
# argv[4] caps the dictionary size.
# NOTE(review): ntopics is only assigned here when argv[3] is present; a
# default is presumably set earlier in the script -- confirm.
if len(sys.argv) > 3:
    ntopics = int(sys.argv[3])

if len(sys.argv) > 4:
    keep_words = int(sys.argv[4])
else:
    keep_words = DEFAULT_DICT_SIZE

# Reuse the cached dictionary and pickled corpus when both files exist;
# otherwise build them from `inp` and write the cache.
if os.path.exists(outp + '_wordids.txt.bz2') and os.path.exists(outp + '_corpus.pkl.bz2'):
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')
else:
    wiki = TextCorpus(inp)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tfidf
# Load the previously written TF-IDF corpus if present; otherwise fit a
# TF-IDF model on the corpus.
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
else:
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
__author__ = 'Marci'
import logging, sys, pprint
from gensim.corpora import TextCorpus, MmCorpus, Dictionary

# Send INFO-level log output (including gensim's) to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# gensim docs: "Provide a filename or a file-like object as input and TextCorpus will be initialized with a
# dictionary in `self.dictionary` and will support the `iter` corpus method. For other kinds of corpora, you only
# need to override `get_texts` and provide your own implementation."
# NOTE: YOUR_CORPUS is a placeholder that is not defined in this script; replace it with your own input.
background_corpus = TextCorpus(input=YOUR_CORPUS)

# Important -- save the dictionary generated by the corpus, or future operations will not be able to map results
# back to original words.
background_corpus.dictionary.save("my_dict.dict")

MmCorpus.serialize(
    "background_corpus.mm", background_corpus
)  # Persists the background corpus in Matrix Market format. File may be several GBs.

### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download

# This will take many hours! Output is Wikipedia as a bag-of-words (BOW) sparse matrix.
wiki_corpus = WikiCorpus(articles)
# Save the dictionary as well, for the same reason as above.
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  # File will be several GBs.
def __init__(self, corpus_file):
    """Store the vocabulary of ``corpus_file`` in ``self.words``.

    Args:
        corpus_file: input handed to ``TextCorpus``, whose dictionary values
            (the words) are kept.
    """
    vocabulary = TextCorpus(input=corpus_file).dictionary
    self.words = vocabulary.values()
from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from gensim.models import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.hdpmodel import HdpModel
import bz2

# Common path prefix for the input archive and every file written below.
out = '/home/mjg/data/descriptions'

# Form corpus from the bz2-compressed input (out + '.bz2').
corpus = TextCorpus(bz2.BZ2File(out + '.bz2'))

# remove common words
stoplist = set(
    'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your'
    .split(','))
# Translate the stop words that actually occur in the dictionary to ids.
stop_ids = [
    corpus.dictionary.token2id[stopword] for stopword in stoplist
    if stopword in corpus.dictionary.token2id
]
corpus.dictionary.filter_tokens(stop_ids)

# only keep the most frequent words
corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)

# save stuff: the bag-of-words corpus and the word-id mapping as text
MmCorpus.serialize(out + '_bow.mm', corpus, progress_cnt=10000)
corpus.dictionary.save_as_text(out + '_wordids.txt.bz2')

# save memory: reload the mapping from the file just written and drop the corpus
dictionary = Dictionary.load_from_text(out + '_wordids.txt.bz2')
del corpus

# initialize corpus reader and word->id mapping
load_dotenv('./.env') # logging import logging logger = logging.getLogger() ch = logging.StreamHandler() ch.setLevel(logging.INFO) logger.addHandler(ch) logger.info("Setting up app!") app = Flask(__name__) CORS(app) # setup model stuff corpus = TextCorpus('jobspicker/jobspicker-descriptions.csv') corpus.dictionary.filter_extremes(no_below=4, no_above=.9, keep_n=100000) sentences = [list(g) for g in list(corpus.get_texts())] tfidf = TfidfModel(corpus) model = Word2Vec.load("profiles.model") corp_vecs = corpus_vec(sentences, model, corpus) # create simple helper functions get_vec = lambda t: sentence_to_vec(t, model, corpus, tfidf) get_job = lambda v: get_closest_doc(v, corp_vecs, sentences) # our database of bayesopt models user_models = {} @app.route('/init/<i>') def init(i):