Python TextCorpus.TextCorpus 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: gensim.corpora

클래스/타입: TextCorpus

메소드/함수: TextCorpus

hotexamples.com에서의 예제들: 10

Python TextCorpus.TextCorpus - 10개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 gensim.corpora.TextCorpus.TextCorpus에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

TextCorpus(10)

get_texts(3)

load(2)

__init__(1)

save(1)

예제 #1

파일 보기

def process_records(records, fields, target, textmodel=None):
	"""Build a feature matrix and a label list from ``records``.

	For every record the values of ``fields`` are split into numeric values
	(as decided by ``is_number``) and everything else. The non-numeric values
	are lower-cased, tokenized and buffered as one line of text per record;
	the numeric values fill one row of a sparse matrix.

	Args:
		records: Sequence of records; ``len()``, ``.get()`` and item access
			are used on it / its elements.
		fields: Names of the fields to turn into features.
		target: Name of the field whose value is collected as the label.
		textmodel: ``'lsi'``, ``'tfidf'`` or ``'hashing'`` to choose how the
			buffered text is vectorized, or ``None`` (the default).

	Returns:
		A ``(X, y_labels, textmodel)`` tuple, where ``textmodel`` is the
		fitted LSI/TF-IDF model, or ``None`` for ``'hashing'`` and for the
		numeric-only case.
	"""
	tokenize = CountVectorizer().build_analyzer()

	# Both stay None until the first record that actually needs them.
	input = None
	X = None
	y_labels = []

	for i, record in enumerate(records):
		nums = []
		strs = []
		y_labels.append(record.get(target))

		for field in fields:
			if is_number(record.get(field)):
				nums.append(record[field])
			else:
				# Missing/falsy values become the empty string.
				strs.append(str(record.get(field) or "").lower())
		if strs:
			if input is None:
				input = StringIO.StringIO()
			# Python 2 print-to-file: one space-joined token line per record.
			print >> input, " ".join(tokenize(" ".join(strs)))
		if nums:
			if X is None:
				# Column count is fixed by the first record with numeric fields.
				X = sp.lil_matrix((len(records),len(nums)))
			X[i] = np.array(nums, dtype=np.float64)

	if input is not None:
		# Keep the numeric features aside; X is reused for the text features.
		if X is not None:
			X_2 = X.tocsr()
		else:
			X_2 = None

		if isinstance(textmodel,basestring):
			if textmodel == 'lsi':
				corpus = TextCorpus(input)
				textmodel = LsiModel(corpus, chunksize=1000)
			elif textmodel == 'tfidf':
				corpus = TextCorpus(input)
				textmodel = TfidfModel(corpus)
			elif textmodel == 'hashing':
				# Hashing is stateless, so no model is returned to the caller.
				textmodel = None
				hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
				# Rewind so the buffered lines are read from the start.
				input.seek(0)
				X = hasher.transform(tokenize(line.strip()) for line in input)
		# NOTE(review): `corpus` is only assigned in the 'lsi' / 'tfidf'
		# branches above. If a truthy non-string `textmodel` is passed in,
		# `textmodel[corpus]` below looks like it would raise NameError — confirm
		# whether passing a pre-fitted model is meant to be supported.
		if textmodel:
			num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[]))
			X = corpus2csc(textmodel[corpus], num_terms).transpose()

		# NOTE(review): when `textmodel` is None, X still holds the numeric
		# matrix here, so it is stacked with its own CSR copy — verify intent.
		if X_2 is not None:
			# print >> sys.stderr, "X SHAPE:", X.shape
			# print >> sys.stderr, "X_2 SHAPE:", X_2.shape
			X = sp.hstack([X, X_2], format='csr')

	elif X is not None:
		# Numeric-only input: there is no text, hence no text model to return.
		textmodel = None
		X = X.tocsr()

	# NOTE(review): X is still None if no record produced text or numeric
	# values (e.g. empty `records`), in which case `.shape` would fail.
	print >> sys.stderr, "X SHAPE:", X.shape

	return X, y_labels, textmodel

예제 #2

파일 보기

파일: lsamodel.py 프로젝트: jyt109/Spark-Gensim

def pretrain():
    """Build the dictionary and the serialized corpus from the text corpus.

    Reads the corpus at ``text_corpus_file``, saves its dictionary both in
    binary form (``dict_file``) and as text (``dic_txt_file``), serializes
    the corpus to ``mm_corpus_file`` and prints a summary of the written
    corpus. The four paths are names defined outside this function.
    """
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)
    # serialize() writes the file and returns None, so printing its result
    # only ever showed "None". Reopen the written corpus to report on it.
    MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    mm = MmCorpus(mm_corpus_file)
    print(mm)

예제 #3

파일 보기

파일: num_translations_feature_extractor.py 프로젝트: fredblain/marmot

 def __init__(self, lex_prob_file, corpus_file):
     """Load lexical probabilities and corpus word frequencies.

     Args:
         lex_prob_file: Path to a whitespace-separated file; on each line
             the second field is used as the key and the third field is
             parsed as a float and appended to that key's list.
         corpus_file: Input handed to ``TextCorpus`` to count word
             frequencies.
     """
     self.lex_prob = defaultdict(list)
     # `with` guarantees the file is closed, even if a malformed line raises.
     with open(lex_prob_file) as lex_file:
         for line in lex_file:
             # split() already drops the trailing newline. Slicing off the
             # last character first would truncate the final field whenever
             # the file does not end with a newline.
             chunks = line.split()
             self.lex_prob[chunks[1]].append(float(chunks[2]))
     corpus = TextCorpus(input=corpus_file)
     # Flatten all tokenized texts into a single stream of words.
     self.corpus_freq = FreqDist(word for text in corpus.get_texts() for word in text)
     self.thresholds = [0.01, 0.05, 0.1, 0.2, 0.5]

예제 #4

파일 보기

파일: lsamodel.py 프로젝트: jyt109/Spark-Gensim

def train(text_corpus_file, dict_file):
    """Fit a 400-topic LSI model on a text corpus, save it and print stats.

    Args:
        text_corpus_file: Input handed to ``TextCorpus``.
        dict_file: Path of a saved ``Dictionary`` used as the id-to-word map.

    The fitted model is written to ``model_file``, a name defined outside
    this function.
    """
    corpus = TextCorpus(text_corpus_file)
    id2word = Dictionary.load(dict_file)
    lsi = LsiModel(corpus=corpus, id2word=id2word, num_topics=400)
    lsi.save(model_file)

    # Show the projection's `u` array, its total size and the size of row 0.
    projection_u = lsi.projection.u
    print(projection_u)
    print(projection_u.size)
    print(projection_u[0].size)

예제 #5

파일 보기

def train_gensim():
    """Train a gensim Word2Vec model on ``args.txtPath`` and save it.

    Stop words are gathered from Spark's default list for
    ``args.stop_word_lang`` and/or from ``args.stop_word_file`` (one word per
    line), then removed from every tokenized text. The model is saved to
    ``args.modelPath`` and the elapsed wall-clock time is printed.
    """
    from gensim.corpora import TextCorpus
    from gensim.corpora.textcorpus import lower_to_unicode
    from gensim.models import Word2Vec as GensimWord2Vec

    start = time()

    # A set gives O(1) membership tests; the filter below runs once per token.
    stopwords = set()
    if args.stop_word_lang:
        # starting spark only for this...
        spark = SparkSession.builder.appName("load stop words").getOrCreate()
        stopwords.update(StopWordsRemover.loadDefaultStopWords(args.stop_word_lang))
        spark.sparkContext.stop()
    if args.stop_word_file:
        with open(args.stop_word_file) as stop_word_file:
            stopwords.update(word.strip("\n") for word in stop_word_file)

    def remove_stopwords(tokens):
        """Return ``tokens`` without any collected stop word."""
        return [token for token in tokens if token not in stopwords]

    # NOTE(review): the dummy non-None `dictionary` presumably keeps
    # TextCorpus from scanning the input to build a real Dictionary —
    # confirm against the installed gensim version.
    corpus = TextCorpus(
        args.txtPath,
        dictionary={None: None},
        character_filters=[lower_to_unicode],
        token_filters=[remove_stopwords],
    )

    class _Sentences(object):
        """Restartable view over the corpus texts.

        A bare ``corpus.get_texts()`` generator is exhausted after one
        pass, so every training epoch after the first would see no data.
        """

        def __iter__(self):
            return corpus.get_texts()

    sentences = _Sentences()

    model = GensimWord2Vec(
        seed=1,
        alpha=args.step_size,
        size=args.vector_size,
        window=args.window_size,
        sample=1e-6,
        sg=1,
    )
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(args.modelPath)

    end = time()
    print("Gensim training took {} seconds".format(end - start))

예제 #6

파일 보기

파일: make_corpus_lda1.py 프로젝트: MikeLepekhin/Non-thematic-Text-Classification

# An optional third command-line argument sets `ntopics`.
if len(sys.argv) > 3:
    ntopics = int(sys.argv[3])

# An optional fourth command-line argument caps the vocabulary size.
keep_words = int(sys.argv[4]) if len(sys.argv) > 4 else DEFAULT_DICT_SIZE

# Build the corpus and the word-id file unless both are already on disk.
if not (os.path.exists(outp + '_wordids.txt.bz2')
        and os.path.exists(outp + '_corpus.pkl.bz2')):
    wiki = TextCorpus(inp)
    # Prune the vocabulary before anything is persisted.
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # Read the id->word mapping back from file; according to the original
    # author this seems to need less memory than keeping wiki.dictionary.
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
else:
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')

# TF-IDF stage: reuse the stored matrix if present, otherwise fit a model.
if not os.path.exists(outp + '_tfidf.mm'):
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
else:
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')

예제 #7

파일 보기

__author__ = 'Marci'

import logging
import pprint
import sys

from gensim.corpora import Dictionary, MmCorpus, TextCorpus, WikiCorpus, wikicorpus

# Send INFO-level log output (including gensim's) to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# --- Background corpus from your own text --------------------------------
# Per the gensim docs: given a filename or file-like object, TextCorpus
# builds a dictionary in `self.dictionary` and supports iteration. For other
# kinds of corpora, override `get_texts` with your own implementation.
background_corpus = TextCorpus(input=YOUR_CORPUS)

# Persist the dictionary the corpus generated; without it later results
# cannot be mapped back to the original words.
background_corpus.dictionary.save("my_dict.dict")

# Store the corpus in Matrix Market format (original author's warning: the
# file will be several GBs).
MmCorpus.serialize("background_corpus.mm", background_corpus)

# --- Large training/background corpus from Wikipedia ---------------------
# Dump available from http://en.wikipedia.org/wiki/Wikipedia:Database_download
articles = "enwiki-latest-pages-articles.xml.bz2"

# Original author's warning: this will take many hours! The output is
# Wikipedia as a bag-of-words (BOW) sparse matrix.
wiki_corpus = WikiCorpus(articles)
wiki_corpus.dictionary.save("wiki_dict.dict")

# Original author's warning: the file will be several GBs.
MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)

예제 #8

파일 보기

파일: oov_feature_extractor.py 프로젝트: tien-le-grenoble/marmot

 def __init__(self, corpus_file):
     """Store the dictionary values of the corpus built from ``corpus_file``.

     Args:
         corpus_file: Input handed to ``TextCorpus``.
     """
     vocabulary = TextCorpus(input=corpus_file).dictionary
     self.words = vocabulary.values()

예제 #9

파일 보기

from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from gensim.models import TfidfModel
from gensim.models.ldamodel import LdaModel
from gensim.models.hdpmodel import HdpModel
import bz2

out = '/home/mjg/data/descriptions'

# Build the corpus straight from the bz2-compressed descriptions file.
corpus = TextCorpus(bz2.BZ2File(out + '.bz2'))

# Common English words to drop from the dictionary.
stoplist = set(
    'a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your'
    .split(','))

# Look every stop word up once; words missing from the dictionary give None.
stop_ids = [
    token_id
    for token_id in map(corpus.dictionary.token2id.get, stoplist)
    if token_id is not None
]
corpus.dictionary.filter_tokens(stop_ids)

# Prune the vocabulary: drop rare and very common tokens, cap its size.
corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)

# Persist the bag-of-words corpus and the word-id mapping.
MmCorpus.serialize(out + '_bow.mm', corpus, progress_cnt=10000)
corpus.dictionary.save_as_text(out + '_wordids.txt.bz2')

# Reload only the mapping and release the corpus object to save memory.
dictionary = Dictionary.load_from_text(out + '_wordids.txt.bz2')
del corpus

# initialize corpus reader and word->id mapping

예제 #10

파일 보기

load_dotenv('./.env')

# Logging: attach an INFO-level stream handler to the root logger.
import logging

logger = logging.getLogger()
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
logger.addHandler(ch)

# NOTE(review): only the handler's level is set here. Unless the root
# logger's own level is lowered elsewhere, its default threshold may filter
# this INFO message out before it reaches the handler — confirm.
logger.info("Setting up app!")

app = Flask(__name__)
CORS(app)

# Model setup: corpus, pruned dictionary, tokenized sentences, TF-IDF
# weights, the stored Word2Vec model and one vector per corpus document.
corpus = TextCorpus('jobspicker/jobspicker-descriptions.csv')
corpus.dictionary.filter_extremes(no_below=4, no_above=0.9, keep_n=100000)
sentences = [list(g) for g in corpus.get_texts()]
tfidf = TfidfModel(corpus)
model = Word2Vec.load("profiles.model")
corp_vecs = corpus_vec(sentences, model, corpus)


# Small helpers that bind the module-level model objects.
def get_vec(t):
    """Return ``sentence_to_vec`` of ``t`` using the loaded models."""
    return sentence_to_vec(t, model, corpus, tfidf)


def get_job(v):
    """Return ``get_closest_doc`` for vector ``v`` over the corpus vectors."""
    return get_closest_doc(v, corp_vecs, sentences)


# our database of bayesopt models
user_models = {}

@app.route('/init/<i>')
def init(i):