def setUp(self):
    """setup lee test corpora"""
    global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    latin1 = partial(utils.to_unicode, encoding='latin1')
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)]
def parse_df(df):
    df['parsed series'] = preprocess_documents(df['series'].astype('str'))
    df['parsed reviews'] = preprocess_documents(df['reviews'].astype('str'))
    df['parsed blurb'] = preprocess_documents(df['blurb'].astype('str'))
    df['parsed'] = df['parsed reviews'] + df['parsed blurb'] + df['parsed series']
    df = df.sort_values(by=['nratings'], ascending=False).reset_index(drop=True)
    return df
def clean_text_by_sentences(text, stemming):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    original_sentences = split_sentences(text)
    if stemming:
        filtered_sentences = [join_words(sentence)
                              for sentence in gensim_preprocessing.preprocess_documents(original_sentences)]
    else:
        nostem_filters = [f for f in gensim_preprocessing.DEFAULT_FILTERS
                          if f != gensim_preprocessing.stem_text]
        filtered_sentences = [join_words(sentence)
                              for sentence in gensim_preprocessing.preprocess_documents(original_sentences,
                                                                                        filters=nostem_filters)]
    return merge_syntactic_units(original_sentences, filtered_sentences)
def processTrainingData(list_of_tweets):
    list_of_tweets = [prep.strip_short(line) for line in list_of_tweets]  # removes short words with less than 3 characters
    list_of_tweets = prep.preprocess_documents(list_of_tweets)  # removes punctuation, numbers, whitespace
    return list_of_tweets  # each tweet is returned as a list
def get_response(utterance, genre_model):
    # preprocess utterance to remove unwanted characters and convert to lower case
    pre = utterance.lower().replace("-", "").replace("?", "").replace("'", "").split()
    preprocess = preprocess_documents(pre)
    preprocessed = [str(i) for i in preprocess]
    # get doc2vec representation for utterance with respect to genre classifier model
    utterance_genre_vector = genre_model.infer_vector(preprocessed)
    # get predicted document (genre)
    sims = genre_model.docvecs.most_similar([utterance_genre_vector])
    matched_genre = sims[0][0]
    # load in document sentence classification model and predict target sentence
    doc_model = Doc2Vec.load(model_dir + matched_genre + ".model")
    # get doc2vec representation for utterance with respect to sentence classification model
    utterance_doc_vector = doc_model.infer_vector(preprocessed)
    # get predicted sentence
    sims = doc_model.docvecs.most_similar([utterance_doc_vector])
    matched_sentence_index = sims[0][0]
    # get corresponding response to that sentence and return it
    f = open(data_dir + matched_genre)
    lines = f.readlines()
    response = ""
    if matched_sentence_index + 1 < len(lines):
        response = lines[matched_sentence_index + 1]
    else:
        response = lines[matched_sentence_index]
    return response.replace("-", "")
def loadRedditData():
    # Loads all saved Reddit posts
    df = pd.read_csv("data/reddit_todayilearned.csv")
    # Selects only the following columns
    df = df[[
        "id", "author", "domain", "url", "num_comments", "score", "title",
        "retrieved_on", "over_18", "permalink", "created_utc", "link_flair_text"
    ]]
    # Leaves only the non-adult content
    df = df[~df["over_18"]]
    # Removes documents with a score of 10 or lower
    df = df[df["score"] > 10]
    # Resets the index
    df.reset_index(inplace=True, drop=True)
    # Creates a list of documents
    documents = df["title"].tolist()
    # Preprocesses the documents
    texts = preprocess_documents(documents)
    # Creates the dictionary
    dictionary = corpora.Dictionary(texts)
    # Creates the corpus using bag-of-words
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Generates the TF-IDF model
    tfidf = models.TfidfModel(corpus)
    # Creates the TF-IDF corpus
    corpus_tfidf = tfidf[corpus]
    # Fits an LSI model (with 15 topics)
    model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)
    # Identifies topics for each document
    corpus_wrapper = model[corpus_tfidf]
    # Creates the similarity index
    index = similarities.MatrixSimilarity(corpus_wrapper)
    return corpus_wrapper, index, df
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
    'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
    'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.summarization.textcleaner import clean_text_by_word
        >>> clean_text_by_word("God helps those who help themselves")
        {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
        'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
        'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def clean_text_by_sentences(text):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]
    return merge_syntactic_units(original_sentences, filtered_sentences)
def clean_text_by_sentences(text):
    original_sentences = split_sentences(text)
    filtered_sentences = [
        join_words(sentence)
        for sentence in preprocess_documents(original_sentences)
    ]
    return merge_syntactic_units(original_sentences, filtered_sentences)
def setUp(self):
    """setup lee test corpora"""
    global bg_corpus, corpus, human_sim_vector
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    with open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(f)
    with open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(f)

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
def __iter__(self):
    for idx, doc in enumerate(self.doc_list):
        words = doc.lower().replace("-", "").replace("?", "").replace("'", "").split()
        # remove stop words, punctuation and numbers
        preprocess = preprocess_documents(words)
        # convert unicode to ascii string
        preprocessed = [str(i) for i in preprocess]
        yield LabeledSentence(preprocessed, tags=[self.labels_list[idx]])
def clean_text_by_word(text, deacc=True):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def clean_text_by_word(text):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)
def __build_document(self):
    """
    Open up the pre-processed document, read it in, and use the gensim
    preprocess_documents function to process it (see gensim for documentation).
    Then build a vocabulary based on the processed documents.
    """
    with open('consolidated_nyt.tsv') as r:
        self.documents = r.read().splitlines()
    self.documents = preprocess_documents(self.documents)
    self.number_of_documents = len(self.documents)
    self.vocabulary = corpora.Dictionary(self.documents)
    self.vocabulary_size = len(self.vocabulary)
    print("Number of documents:" + str(len(self.documents)))
    print("Vocabulary size:" + str(self.vocabulary_size))
def setUp(self):
    """setup lee test corpora"""
    global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    latin1 = lambda line: utils.to_unicode(line, encoding='latin1')
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
def prepare_corpus(documents):
    tic = time()
    # lower, strip tags, strip punctuation, strip multiple whitespaces,
    # strip numeric, remove stopwords, strip short, stem text
    texts = preprocessing.preprocess_documents(documents)
    # filter out hapax legomena
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once] for text in texts]
    print(time() - tic)
    dictionary = corpora.Dictionary(texts)
    dictionary.save('texts/' + prefix + '/dictionary.dict')
    raw_corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('texts/' + prefix + "/corpus.mm", raw_corpus)
def clean_text_by_sentences(text):
    """Tokenize a given text into sentences, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Sentences of the given text.

    """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]
    return merge_syntactic_units(original_sentences, filtered_sentences)
def load_corpus(filename, d=3):
    import csv, sys

    # Increase max line length for csv.reader:
    max_int = sys.maxsize
    decrement = True
    while decrement:
        decrement = False
        try:
            csv.field_size_limit(max_int)
        except OverflowError:
            max_int = int(max_int / 10)
            decrement = True

    docs = []
    labs = []
    labelmap = dict()
    pat = re.compile(r"[A-Z]\d{2}")
    f = open(filename, 'r')
    reader = csv.reader(f)
    for row in reader:
        doc = row[1]
        lab = row[2]
        if len(lab) > 3:
            lab = lab.split(" ")
            lab = list(filter(lambda i: pat.search(i), lab))
            lab = [partition_label(x, d) for x in lab]
            lab = [item for sublist in lab for item in sublist]
            lab = list(set(lab))
            for x in lab:
                labelmap[x] = 1
        else:
            lab = partition_label(lab, d)
            for x in lab:
                labelmap[x] = 1
            # lab = [lab]
        docs.append(doc)
        labs.append(lab)
    f.close()
    print("Stemming documents ....")
    docs = gensimm.preprocess_documents(docs)
    return docs, labs, list(labelmap.keys())
logging.info('loading word mapping')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('loading corpus')
corpus_bow = MmCorpus(working_corpus + '_bow.mm')

logging.info("create log_ent model and save it to disk")
tfidf = LogEntropyModel(corpus_bow, id2word=dictionary.id2token, normalize=True)
tfidf.save(result_path + corpus_name + log_ent_extension)

logging.info('load small lee corpus and preprocess')
raw_lee_texts = utils.get_txt(lee_corpus)
preproc_lee_texts = preprocessing.preprocess_documents(raw_lee_texts)
bow_lee_texts = [
    dictionary.doc2bow(text, allowUpdate=False, returnMissingWords=False)
    for text in preproc_lee_texts
]

logging.info('initialize LSI model')
lsi = models.LsiModel(tfidf[corpus_bow], id2word=id2word, numTopics=num_topics)
lsi.save((result_path + corpus_name + '_%i_ent' + lsi_extension) % num_topics)

logging.info('transforming small lee corpus (LSI)')
corpus_lsi = lsi[tfidf[bow_lee_texts]]

# compute pairwise similarity matrix of transformed corpus
sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
for i, par1 in enumerate(corpus_lsi):
    for j, par2 in enumerate(corpus_lsi):
logging.info('loading word mapping')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('loading corpus')
corpus_bow = MmCorpus(working_corpus + '_bow.mm')

logging.info("create log_ent model and save it to disk")
tfidf = LogEntropyModel(corpus_bow, id2word=dictionary.id2token, normalize=True)
tfidf.save(result_path + corpus_name + log_ent_extension)

logging.info('load small lee corpus and preprocess')
raw_lee_texts = utils.get_txt(lee_corpus)
preproc_lee_texts = preprocessing.preprocess_documents(raw_lee_texts)
bow_lee_texts = [dictionary.doc2bow(text, allowUpdate=False, returnMissingWords=False)
                 for text in preproc_lee_texts]

logging.info('initialize LSI model')
lsi = models.LsiModel(tfidf[corpus_bow], id2word=id2word, numTopics=num_topics)
lsi.save((result_path + corpus_name + '_%i_ent' + lsi_extension) % num_topics)

logging.info('transforming small lee corpus (LSI)')
corpus_lsi = lsi[tfidf[bow_lee_texts]]

# compute pairwise similarity matrix of transformed corpus
sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
for i, par1 in enumerate(corpus_lsi):
    for j, par2 in enumerate(corpus_lsi):
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path, p['corpus_path'], p['dict_name']))
    Dictionary.save(dictionary, path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow, id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow], id2word=dictionary, num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' % os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [
        dictionary.doc2bow(text, allow_update=False, return_missing=False)
        for text in preproc_lee_texts
    ]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
import gensim
import pandas as pd
from gensim.parsing.preprocessing import preprocess_documents
from multiprocessing import Pool
from functools import partial
import math
import numpy as np

# use the newsgroup data as corpus
df = pd.read_json("https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json")
documents = df.content.tolist()
documents = preprocess_documents(documents)

# fit an LDA model, n_topic = 5
news_dictionary = Dictionary(documents)
news_dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=5000, keep_tokens=None)
corpus = [news_dictionary.doc2bow(text) for text in documents]
lda = gensim.models.LdaModel(corpus, num_topics=5, id2word=news_dictionary)
lda.show_topics()

# convert gensim corpus to a sparse document-term matrix for coherence measure
corpus_dense = gensim.matutils.corpus2csc(corpus, num_terms=len(
model = gensim.models.Doc2Vec(
    size=200, min_count=1, alpha=0.025, min_alpha=0.001, dm_concat=1
)  # ,window=7)#,train_words=True,learn_doctags=True,learn_words=True)
model.build_vocab(it)
model.train(it, total_examples=model.corpus_count, epochs=50)  # best is 48
model.save(model_dir + "genre.model")

# evaluate model accuracy over entire documents
true_pred = 0
for x in documents:
    with open(data_dir + x, 'r') as f:
        utterance = f.read()
    preprocessed = utterance.lower().replace("-", "").replace("?", "").replace("'", "").split()
    preprocess = preprocess_documents(preprocessed)
    preprocessed = [str(i) for i in preprocess]
    utterance_genre_vector = model.infer_vector(preprocessed)
    sims = model.docvecs.most_similar([utterance_genre_vector])
    if sims[0][0] == x:
        true_pred = true_pred + 1
accuracy = true_pred / len(documents)
print("model accuracy over entire document is: " + str(accuracy))

# evaluate model accuracy over individual sentences
true_pred = 0
total_sentences = 0
for x in documents:
    with open(data_dir + x, 'r') as f:
        document = f.read().splitlines()
    for line in document:
        total_sentences = total_sentences + 1
def generate(db_uri, min_contexts=4, preprocess=False):
    """ Generate a list of citation contexts, given criteria:
            min_contexts
            preprocess (preprocess_documents default; if off only punctuation
                        and multiple whitespaces are removed)
    """
    Base = declarative_base()
    engine = create_engine(db_uri)
    Base.metadata.create_all(engine)
    Base.metadata.bind = engine
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    CitContext = Table('papercitationcontexts', Base.metadata, autoload=True, autoload_with=engine)

    print('querying DB')
    non_unique = session.query(CitContext.columns.paperreferenceid).\
        group_by(CitContext.columns.paperreferenceid).\
        having(func.count(CitContext.columns.paperreferenceid) > min_contexts-1).\
        subquery()
    cit_contexts_db = session.query(CitContext).\
        filter(CitContext.columns.paperreferenceid.in_(non_unique)).\
        all()
    # order_by(BibitemArxivIDMap.arxiv_id.desc()).all()
    print(len(cit_contexts_db))  # 187595127
    print(dir(cit_contexts_db[0]))
    # ['__add__', '__class__', '__contains__', '__delattr__', '__dir__',
    #  '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__',
    #  '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__',
    #  '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__',
    #  '__module__', '__mul__', '__ne__', '__new__', '__reduce__',
    #  '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__',
    #  '__slots__', '__str__', '__subclasshook__', '_asdict', '_fields',
    #  '_real_fields', 'citationcontext', 'count', 'index', 'keys', 'paperid',
    #  'paperreferenceid']
    #
    # TODO
    # Work in progress. ↑ MAG DB results ↓ stuff to adjust for generating
    # a dataset
    # sys.exit()

    print('merging bibitems')
    cited_docs_pre = {}
    uuid_aid_map = {}
    for bibitem in bibitems:
        if global_ids == 'mag':
            aid = bibitem.BibitemMAGIDMap.mag_id
        else:
            aid = bibitem.BibitemArxivIDMap.arxiv_id
        uuid = bibitem.Bibitem.uuid
        uuid_aid_map[uuid] = aid
        in_doc = bibitem.Bibitem.in_doc
        if aid not in cited_docs_pre:
            cited_docs_pre[aid] = {}
        if in_doc not in cited_docs_pre[aid]:
            cited_docs_pre[aid][in_doc] = []
        cited_docs_pre[aid][in_doc].append(uuid)

    print('checking merging results')
    cited_docs = {}
    for aid, doc_dict in cited_docs_pre.items():
        # for evaluation we *need* at least 2 documents containing citation
        # contexts (in order to perform a per doc test/train split)
        if len(doc_dict) > 1:
            cited_docs[aid] = []
            for in_doc, uuid_list in doc_dict.items():
                cited_docs[aid].append({
                    'uuid': uuid_list[0],  # uuid_list should always be len. 1
                    'in_doc': in_doc
                })

    print('going through docs')
    contexts = []
    for aid, doc_list in cited_docs.items():
        tmp_list = []
        num_docs = 0
        for doc in doc_list:
            in_doc = doc['in_doc']
            fn = '{}.txt'.format(in_doc)
            text_file = os.path.join(in_dir, fn)
            with open(text_file) as f:
                text = f.read()
            marker = '{{{{cite:{}}}}}'.format(doc['uuid'])
            marker_found = False
            for m in re.finditer(marker, text):
                margin = int(context_size / 2)
                idx = m.start()
                edx = m.end()
                pre = text[:idx]
                post = text[edx:]
                adj_pre = find_adjacent_citations(pre, uuid_aid_map, backwards=True)
                adj_post = find_adjacent_citations(post, uuid_aid_map)
                adjacent_citations = adj_pre + adj_post
                pre = re.sub(CITE_PATT, '', pre)
                post = re.sub(CITE_PATT, '', post)
                # heuristic pre-cutting (10 times average word length)
                pre = pre[-margin * 6 * 10:]
                post = post[:margin * 6 * 10]
                if preprocess:
                    pre, post = preprocess_documents([pre, post])
                else:
                    custom_filter = [strip_punctuation, strip_multiple_whitespaces]
                    pre = preprocess_string(pre, custom_filter)
                    post = preprocess_string(post, custom_filter)
                placeholder = ''
                if with_placeholder:
                    placeholder = ' [] '
                context = '{}{}{}'.format(' '.join(pre[-margin:]), placeholder, ' '.join(post[:margin]))
                adj_cit_str = '[{}]'.format('|'.join(adjacent_citations))
                tmp_list.append([aid, adj_cit_str, in_doc, context])
                marker_found = True
            if marker_found:
                num_docs += 1
        if len(tmp_list) >= min_contexts and num_docs > 1:
            contexts.extend(tmp_list)

    print(len(contexts))
    with open('items.csv', 'w') as f:
        for vals in contexts:
            line = '{}\n'.format(','.join(vals))
            f.write(line)
def processTrainingData(list_of_tweets):
    list_of_tweets = [prep.strip_short(line) for line in list_of_tweets]
    list_of_tweets = prep.preprocess_documents(list_of_tweets)
    return list_of_tweets
def main(param_file=None):
    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path, p['run'], p['dict_extension']))

    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logging.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [
        dictionary.doc2bow(text, allow_update=False, return_missing=False)
        for text in preproc_lee_texts
    ]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics
    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))
    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):
        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()
    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
def LSI(polarity_cleaned_data, LSI_input):
    df = pd.read_json(polarity_cleaned_data, orient='split')
    text_corpus = df['Comment']
    processed_corpus = preprocess_documents(text_corpus)
    dictionary = gensim.corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
    lsi = gensim.models.LsiModel(bow_corpus, num_topics=200)
    index = gensim.similarities.MatrixSimilarity(lsi[bow_corpus])

    new_doc = gensim.parsing.preprocessing.preprocess_string(LSI_input)
    new_vec = dictionary.doc2bow(new_doc)
    vec_bow_tfidf = lsi[new_vec]
    sims = index[vec_bow_tfidf]

    comment_list = []
    cosine_similarity = []
    comment_polarity = []
    comment_subjectivity = []
    comment_upvotes = []
    for s in sorted(enumerate(sims), key=lambda item: -item[1])[:10]:
        comment_list.append(f"{df['Comment'].iloc[s[0]]}")
        cosine_similarity.append(s[1])
        comment_polarity.append(df['Polarity'].iloc[s[0]])
        comment_subjectivity.append(df['Subjectivity'].iloc[s[0]])
        comment_upvotes.append(df['Upvotes'].iloc[s[0]])

    d = {
        'Cosine Similarity': cosine_similarity,
        'Comments': comment_list,
        'Polarity': comment_polarity,
        'Subjectivity': comment_subjectivity,
        'Upvotes': comment_upvotes
    }
    LSI_df = pd.DataFrame(d)

    ## averages for top 10 comment results
    columns = ['Polarity', 'Subjectivity', 'Cosine Similarity']
    averages = [
        round(LSI_df['Polarity'].mean(), 2),
        round(LSI_df['Subjectivity'].mean(), 2),
        round(LSI_df['Cosine Similarity'].mean(), 2)
    ]
    fig5 = go.Figure(data=[go.Bar(x=columns, y=averages, marker=dict(color='#ffb300'))])
    fig5.update_layout(
        font=dict(color='#ff9100'),
        title='Statistical Averages for Search Results',
        xaxis=dict(title='Comments (from highest cosine similarity to lowest)'),
        yaxis=dict(title='Polarity, Subjectivity and Cosine Similarity Averages',
                   gridcolor='darkgray'),
        plot_bgcolor='#212121',
        paper_bgcolor='#212121')
    fig5.update_traces(opacity=.75)

    return html.Div(children=[
        html.Div(children=[
            dash_table.DataTable(
                columns=[{'name': i, 'id': i} for i in LSI_df.columns],
                style_table={'overflow': 'auto'},
                data=LSI_df.to_dict('records'),
                style_cell={
                    'textAlign': 'left',
                    'whiteSpace': 'normal',
                    'font-family': 'Helvetica',
                    'font-weight': 'lighter',
                    'height': 'auto',
                    'backgroundColor': '#1a1a1a',
                    'color': 'darkgray'
                },
                style_header={
                    'font-weight': 'bold',
                },
                css=[{
                    'selector': '.dash-spreadsheet td div',
                    'rule': '''
                        line-height: 15px;
                        max-height: 30px; min-height: 30px; height: 30px;
                        display: block;
                        overflow-y: hidden;
                    '''
                }],
                tooltip_duration=None,
                tooltip_data=[{
                    column: {'value': str(value), 'type': 'markdown'}
                    for column, value in row.items()
                } for row in LSI_df.to_dict('records')],
            )
        ], className='datatable'),
        html.Div(className='LSI-bar', children=[dcc.Graph(figure=fig5)])
    ])
# Latent semantic indexing (LSI) is an NLP technique used to find similar
# pieces of text. In this example, given a corpus of movie plot summaries, one
# can find the plot most similar to an input string. LSI works by using
# singular value decomposition (SVD), which is non-centered PCA. It is good at
# dealing with synonymy and polysemy in languages but is computationally
# expensive, and as such is not recommended for processing documents in bulk.
import pandas as pd
import gensim
from gensim.parsing.preprocessing import preprocess_documents

df = pd.read_csv('wiki_movie_plots_deduped.csv', sep=',')
df = df[df['Release Year'] >= 2000]
text_corpus = df['Plot'].values

processed_corpus = preprocess_documents(text_corpus)
dictionary = gensim.corpora.Dictionary(processed_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='npu')
corpus_tfidf = tfidf[bow_corpus]

# num_topics is a hyperparameter that can be fine-tuned using a topic coherence measure
lsi = gensim.models.LsiModel(corpus_tfidf, num_topics=200)
index = gensim.similarities.MatrixSimilarity(lsi[corpus_tfidf])

new_doc = gensim.parsing.preprocessing.preprocess_string(new_doc)
new_vec = dictionary.doc2bow(new_doc)
vec_bow_tfidf = tfidf[new_vec]
vec_lsi = lsi[vec_bow_tfidf]

sims = index[vec_lsi]
for s in sorted(enumerate(sims), key=lambda item: -item[1])[:10]:
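# A minimal sketch (not part of the snippet above) of how num_topics could be
# fine-tuned with a coherence measure, as the comment above suggests. It
# assumes the processed_corpus, dictionary and corpus_tfidf built earlier; the
# candidate topic counts are arbitrary illustration values.
from gensim.models import CoherenceModel, LsiModel

for num_topics in (50, 100, 200, 300):
    candidate = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    cm = CoherenceModel(model=candidate, texts=processed_corpus,
                        dictionary=dictionary, coherence='c_v')
    # higher coherence generally indicates more interpretable topics
    print(num_topics, cm.get_coherence())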