# Train and persist the tweet sentiment classifier via a cross-validated grid search.

# console logging has to be configured before the first message is emitted
set_logger()
logging.info('Start: {} '.format(__file__))

# open the database connection
db = MongoDatabase()

# collection holding all labelled training tweets
db_collection = 'training_tweets'

# folder the fitted classification models are written to
model_save_location = os.path.join('files', 'ml_models2')

# fetch every training tweet document
D = db.read_collection(collection=db_collection)

# pair each tweet text with its label (labels are always handled as strings) ...
text_label_pairs = [(doc['text'], str(doc['label'])) for doc in D]

# ... and unzip the pairs into the inputs X and the targets Y
X, Y = zip(*text_label_pairs)

# pipeline options and the grid of parameter values to search over
pipeline_setup = get_pipeline_setup()
grid_setup = get_grid_setup()

# all settings of the grid search, gathered in one place
search_options = {
    'test_size': .2,
    'shuffle': True,
    'pipeline_setup': pipeline_setup,
    'grid_setup': grid_setup,
    'cv': 10,
    'n_jobs': cpu_count(),
    'scoring': 'f1_weighted',
    'verbose': 10,
    'save_model': True,
    'model_save_location': model_save_location,
}

# run the grid search; the model is saved because save_model is set
execute_gridsearch_cv(X, Y, **search_options)
db = MongoDatabase() # execute if set to True if filter_tweets: """ Filter raw target tweet - remove non-English tweets - remove retweet - remove tweets that do not originate from an academic or scientist (by using bio text) raw tweets are stored in the collection 'raw_tweets' filtered tweets will be stored in the collectin 'filtered_tweets' """ # read tweets documents from database D = db.read_collection(collection='raw_tweets') # tracker to keep track of processed tweet IDs (in case we want to repeat the process for a set of new tweets) tweet_tracker = set([ '{}{}'.format(x['tweet_type'], x['id']) for x in db.read_collection(collection='filtered_tweets') ]) # read academic/scientists professions (so we can filter the bio on these words) academic_words = [ x.strip('\n').strip('\r').lower() for x in read_plain_text( os.path.join('files', 'filter_bio', 'academic_words.txt'), read_lines=True) ] # loop over each tweet document
db = MongoDatabase() # location of target tweets location_tweets = os.path.join('files', 'target_tweets') # process tweets for each mode of research for mode in MoR: logging.info('Processing mode of research: {}'.format(mode)) # read tweets files F = read_directory(os.path.join(location_tweets, mode)) # tracker to keep track of processed tweet ids tweet_tracker = set(['{}{}'.format(x['tweet_type'], x['id']) for x in db.read_collection(collection='raw_tweets')]) # loop over each file, read content, parse relevant fields, save to db for i, f in enumerate(F): logging.info('Processing file {} {}/{}'.format(f, i + 1, len(F))) # read tweets from file (as list) tweets = read_plain_text(f, read_lines=True) # loop over each tweet for tweet in tweets: # convert string to json tweet = json.loads(tweet)
# create logging to console set_logger() # verbose logging.info('Start: {} '.format(__file__)) # create database connection db = MongoDatabase() # load classifier clf = joblib.load(os.path.join('files', 'ml_models', 'LinearSVC.pkl')) # read labels for target tweets that have been manually labeled and convert to dictionary with key = tweet ID and value = label true_labels = { d['tweet_id']: d['label'] for d in db.read_collection(collection='manual_tweets_raw') } # load tweets for which we want to infer the sentiment label D = db.read_collection(collection='target_tweets') # create empty numpy array so we can retrieve labels later on somewhat faster labels = np.zeros((D.count(), 3), dtype=np.int) # loop over each target tweet for i, d in enumerate(D): logging.debug(' - Processing tweet {}/{}'.format(i + 1, D.count())) # check if we have a true label for the target tweet, if so, skip prediction and use true label if d['tweet_id'] in true_labels:
Tweets are saved into the collection 'sanders_tweets_raw' """ # name of the collection to store tweets to db_collection = 'sanders_tweets_raw' # location sanders tweets sanders_tweets_location = os.path.join('files', 'training_tweets', 'sanders', 'sanders_tweets.csv') # read sanders tweets data = read_csv(sanders_tweets_location) # read tweets that have already been processed (if you run for the first time, this will be an empty set) processed_tweets = set([ x['tweet_id'] for x in db.read_collection(collection=db_collection) ]) # loop over each row of the CSV file for i, row in enumerate(data): # verbose logging.info('Processing tweet {}/{}'.format(i + 1, len(data))) # get values from columns tweet_label = row[1] tweet_id = row[2] # check if tweet_id has already been processed if not tweet_id in processed_tweets:
class Evaluation():
    """
        Evaluate the created LDA models: coherence scores, a heatmap of those
        scores and tables with the most probable words per topic.
    """

    def __init__(self):
        """
            Log the initialization and open the database connection
        """

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

    @staticmethod
    def _model_parameters(model_path):
        """
            Extract the grid search parameters from the location of a saved model

            Models are stored as <models_folder>/<k>/<dir_prior>/<random_state>/<num_pass>/<iteration>/lda.model,
            so the five folders directly above the model file are the parameters.
            They are read from the end of the path so the depth of models_folder does not matter.

            Parameters
            ----------
            model_path: string
                path of the saved LDA model file

            Returns
            -------
            tuple of strings
                (k, dir_prior, random_state, num_pass, iteration)

            Raises
            ------
            ValueError
                when the path has fewer than five folders above the model file
        """

        parts = os.path.normpath(model_path).split(os.sep)

        if len(parts) < 6:
            raise ValueError('Unexpected LDA model location: {}'.format(model_path))

        return tuple(parts[-6:-1])

    @staticmethod
    def _model_key(k, dir_prior, random_state, num_pass, iteration):
        """
            Create the identifier used to recognize models that already have a coherence score
        """

        return '{}-{}-{}-{}-{}'.format(k, dir_prior, random_state, num_pass, iteration)

    @staticmethod
    def _parse_topic_terms(topic_string):
        """
            Split a topic string of the form '0.025*"model" + 0.010*"data"' into its terms

            Parameters
            ----------
            topic_string: string
                terms formatted as probability*"word", joined by +

            Returns
            -------
            list of tuples
                (word, probability) per term, probability kept as the original string
        """

        terms = []

        for term in topic_string.split("+"):

            # trim the spaces around the term and separate probability from word
            parts = term.strip().split('*')

            terms.append((parts[1].strip('"'), parts[0]))

        return terms

    def calculate_coherence(self, file_folder=os.path.join('files', 'lda'), models_folder=os.path.join('files', 'models')):
        """
            Calculate the CV coherence score for each of the created LDA models

            Scores are saved to the collection 'coherence'; models that already
            have a score in that collection are skipped.

            Parameters
            ----------
            file_folder: os.path
                location of the dictionary and corpus for gensim
            models_folder: os.path
                location where the lda model is saved
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read dictionary and corpus (only the dictionary is needed here)
        dictionary, _ = get_dic_corpus(file_folder)

        # load bag of words features of each document from the database,
        # documents that could not be tokenized have no tokens and are left out
        texts = [x['tokens'] for x in self.db.read_collection('publications_raw') if x.get('tokens') is not None]

        # get path location for models
        M = [x for x in read_directory(models_folder) if x.endswith('lda.model')]

        # read processed models from database, as a set so every lookup is constant time
        processed_models = {
            self._model_key(x['k'], x['dir_prior'], x['random_state'], x['num_pass'], x['iteration'])
            for x in self.db.read_collection('coherence')
        }

        # calculate coherence score for each model
        for i, m in enumerate(M):

            logging.info('Calculating coherence score: {}/{}'.format(i + 1, len(M)))

            print(m)

            # number of topics, dirichlet prior, random initialization,
            # passes over the corpus, max iteration for convergence
            k, dir_prior, random_state, num_pass, iteration = self._model_parameters(m)

            logging.info('k: {}, dir_prior: {}, random_state: {}, num_pass: {}, iteration: {}'.format(k, dir_prior, random_state, num_pass, iteration))

            # check if coherence score already obtained
            if self._model_key(k, dir_prior, random_state, num_pass, iteration) in processed_models:
                logging.info('coherence score already calculated, skipping ...')
                continue

            # load LDA model
            model = models.LdaModel.load(m)

            # get coherence c_v score
            coherence_c_v = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')

            # get coherence score
            score = coherence_c_v.get_coherence()

            # logging output
            logging.info('coherence score: {}'.format(score))

            # save score to database
            doc = {
                'k': k,
                'dir_prior': dir_prior,
                'random_state': random_state,
                'num_pass': num_pass,
                'iteration': iteration,
                'coherence_score': score
            }

            self.db.insert_one_to_collection('coherence', doc)

    def plot_coherence(self, min_k=2, max_k=20, save_location=os.path.join('files', 'plots'), plot_save_name='coherence_scores_heatmap.pdf'):
        """
            Read coherence scores from database and create heatmap to plot scores

            Parameters
            -----------
            min_k: int
                lowest number of topics created when creating LDA models. Here 2
            max_k: int
                highest number of topics created when creating LDA models. Here 20
            save_location: os.path
                location where to save the plot
            plot_save_name: string
                name for the plot
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # make sure plot save location exists
        create_directory(save_location)

        # read documents from database that contain coherence scores
        D = list(self.db.read_collection(collection='coherence'))

        # convert data from document into a list, sorted once instead of once per k
        data = sorted(
            [int(x['k']), x['dir_prior'], x['random_state'], x['num_pass'], x['iteration'], x['coherence_score']]
            for x in D
        )

        # one single-row dataframe per value of k, combined after the loop
        frames = []

        # loop trough values of k parameter and find relevant scores for each grid search combination
        for k in range(min_k, max_k + 1):

            # create dataframe to temporarily store values
            df_temp = pd.DataFrame(index=[k])

            # loop trough the data to obtain only the scores for a specific k value
            for row in data:
                if row[0] == k:
                    df_temp['{}-{}-{}-{}'.format(row[1], row[2], row[3], row[4])] = pd.Series(row[5], index=[k])

            frames.append(df_temp)

        # combine the dataframes of all k values (an empty range of k yields an empty dataframe)
        df = pd.concat(frames) if frames else pd.DataFrame()

        # transpose the dataframe
        df = df.transpose()

        # plot the heatmap
        ax = sns.heatmap(df, cmap="Blues", annot=True, vmin=0.500, vmax=0.530, square=True, annot_kws={"size": 11},
                         fmt='.3f', linewidths=.5, cbar_kws={'label': 'coherence score'})

        # adjust the figure somewhat
        ax.xaxis.tick_top()
        plt.yticks(rotation=0)
        plt.xticks(rotation=0, ha='left')
        fig = ax.get_figure()
        fig.set_size_inches(19, 6)

        # save figure
        fig.savefig(os.path.join(save_location, plot_save_name), bbox_inches='tight')

        # close the figure so a following plot starts with clean axes
        plt.close(fig)

    def output_lda_topics(self, K=9, dir_prior='auto', random_state=42, num_pass=15, iteration=200, top_n_words=10, models_folder=os.path.join('files', 'models'), save_folder=os.path.join('files', 'tables')):
        """
            Create table with LDA topic words and probabilities
            Creates a table of topic words and probabilties + topics in a list format

            Values for K, dir_prior, random_state, num_pass and iteration will become visible when plotting the coherence score. Use the model that
            achieved the highest coherence score and plug in the correct values. The values will create the correct file location of the LDA model
            for example : files/models/2/auto/42/5/200/lda.model

            Parameters
            -----------
            K: int
                number of topics that resulted in the best decomposition of the underlying corpora
            dir_prior: string
                dirichlet priors 'auto', 'symmetric', 'asymmetric'
            random_state: int
                seed value for random initialization
            num_pass: int
                number of passes over the full corpus
            iteration: int
                max iterations for convergence
            top_n_words: int
                only print out the top N high probability words
            models_folder: os.path
                location of created LDA models
            save_folder: os.path
                location to store the tables
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load LDA model according to parameters
        model = load_lda_model(os.path.join(models_folder, str(K), dir_prior, str(random_state), str(num_pass), str(iteration)))

        # define empty lists so we can fill them with words
        topic_table, topic_list = [], []

        # loop trough all the topics found within K
        for k in range(K):

            # create topic header, e.g. (1) TOPIC X
            topic_table.append(['{}'.format(get_topic_label(k, labels_available=False).upper())])

            # add column for word and probability
            topic_table.append(["word", "prob."])

            # get topic distribution for topic k and return only top-N words
            terms = self._parse_topic_terms(model.print_topic(k, top_n_words))

            # add word and percentage to table, the leading zero of the percentage is dropped
            for word, percentage in terms:
                topic_table.append([word.upper(), "" + percentage.replace("0.", ".")])

            # add empty line for the table
            topic_table.append([""])

            # add topic words to list
            topic_list.append([str(k + 1), ", ".join(word for word, _ in terms).rstrip(", ")])

        # save to CSV
        save_csv(topic_list, 'topic-list', folder=save_folder)
        save_csv(topic_table, 'topic-table', folder=save_folder)
class Interpretation():
    """
        Interpret the chosen LDA model: infer document-topic distributions,
        export publication titles per dominant topic and plot several views
        of the topics found in the corpus.
    """

    def __init__(self):
        """
            Log the initialization, open the database connection and set the output folders
        """

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # location to store plots
        self.plot_save_folder = os.path.join('files', 'plots')

        # location to store tables to
        self.table_save_folder = os.path.join('files', 'tables')

    @staticmethod
    def _sorted_topic_values(topics):
        """
            Return the topic proportions ordered by numeric topic ID

            Topic IDs are stored as strings, so they are converted to int for sorting;
            otherwise '10' would sort before '2'.

            NOTE(review): callers use the position in the returned list as topic ID, which
            only holds when every document stores a proportion for every topic - confirm
            that the model does not drop low probability topics.

            Parameters
            ----------
            topics: dictionary
                key = topic ID as string, value = topic proportion

            Returns
            -------
            list
                topic proportions ordered by topic ID
        """

        ordered = sorted(topics.items(), key=lambda item: int(item[0]))

        return [value for _, value in ordered]

    @staticmethod
    def _dominant_topic(topics):
        """
            Return the (topic ID, proportion) pair with the largest proportion

            Parameters
            ----------
            topics: dictionary
                key = topic ID as string, value = topic proportion
        """

        return max(topics.items(), key=itemgetter(1))

    def infer_document_topic_distribution(self, K=10, dir_prior='auto', random_state=42, num_pass=15, iteration=200, top_n_words=10,
                                          models_folder=os.path.join('files', 'models'), lda_files_folder=os.path.join('files', 'lda')):
        """
            Infer the document topic distribution per publication. The LDA model shows us the word probabilities per topic, but we also want to know what
            topics we find within each document. Here we infer such document-topic distribution and save it to the database so we can use it later to plot some
            interesting views of the corpus

            Values for K, dir_prior, random_state, num_pass and iteration will become visible when plotting the coherence score. Use the model that
            achieved the highest coherence score.

            Parameters
            -----------
            K: int
                number of topics that resulted in the best decomposition of the underlying corpora
            dir_prior: string
                dirichlet priors 'auto', 'symmetric', 'asymmetric'
            random_state: int
                seed value for random initialization
            num_pass: int
                number of passes over the full corpus
            iteration: int
                max iterations for convergence
            top_n_words: int
                not used by this method, kept so existing calls keep working
            models_folder: os.path
                location of created LDA models
            lda_files_folder: os.path
                location of LDA corpus and dictionary
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read dictionary and corpus
        # NOTE(review): neither value is used below (the bag of words is built with model.id2word);
        # the call is kept in case loading has side effects - confirm and remove if not
        dictionary, corpus = get_dic_corpus(lda_files_folder)

        # load LDA model according to parameters
        model = load_lda_model(os.path.join(models_folder, str(K), dir_prior, str(random_state), str(num_pass), str(iteration)))

        # load docs
        D = self.db.read_collection(collection='publications_raw')

        # number of documents, obtained once instead of for every document
        total = D.count()

        # loop through all the documents to infer document-topics distribution
        for i, d in enumerate(D):

            # skip documents without tokens; some documents couldn't properly be tokenized during pre-processing phase
            if d.get('tokens') is None:
                continue

            # print to console
            print_doc_verbose(i, total, d['journal'], d['year'], d['title'])

            # create bag of words from tokens
            bow = model.id2word.doc2bow(d['tokens'])

            # infer document-topic distribution
            topics = model.get_document_topics(bow, per_word_topics=False)

            # convert to dictionary: the topic number becomes a string because
            # mongodb only accepts string keys within a document
            dic_topics = {str(topic_id): float(proportion) for topic_id, proportion in topics}

            # create a new document to add to the database, this time in a different collection
            insert_doc = {'journal': d['journal'], 'year': d['year'], 'title': d['title'], 'topics': dic_topics}

            # save insert_doc to database within publications collection
            self.db.insert_one_to_collection('publications', insert_doc)

    def get_document_title_per_topic(self):
        """
            Get document title per topic
            Here we obtain the publication title of the most dominant topic within that publication
            Most dominant topic is the topic proportion that is the largest
            So if document has topic A = 10%, B = 30%, and C = 60%, then C is the dominant topic
            We can use the titles for the dominant topics to get insights into the label of that topic

            Publications without a topic distribution are skipped.
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load docs
        D = self.db.read_collection(collection='publications')

        # number of documents, obtained once instead of for every document
        total = D.count()

        # empty list where we can append publication titles to
        titles = []

        # loop trough all the docs
        for i, d in enumerate(D):

            # print to console
            print_doc_verbose(i, total, d['journal'], d['year'], d['title'])

            # without topics there is no dominant topic to report
            topics = d.get('topics')
            if not topics:
                continue

            # get the topic ID and percentage of the dominant topic
            dominant_topic_id, dominant_topic_percentage = self._dominant_topic(topics)

            # append to list
            titles.append([d['year'], d['title'], d['journal'], dominant_topic_id, dominant_topic_percentage])

        # save to CSV
        save_csv(titles, 'titles-to-topics', folder=self.table_save_folder)

    def plot_topics_over_time(self, plot_save_name='topics-over-time.pdf'):
        """
            Plot cumulative topic distribution over time, one subplot per topic

            Parameters
            ----------
            plot_save_name: string
                name of the plot
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load docs
        D = self.db.read_collection(collection='publications')

        # create dictionary where we can obtain the topic distribution per year
        year_to_topics = get_year_to_topics(D)

        # calculate the cumulative topic distribution: basically the average distribution per year
        year_to_cum_topics = get_year_to_cum_topics(year_to_topics)

        # convert dictionary to pandas dataframe
        df = pd.DataFrame.from_dict(year_to_cum_topics)

        # five plots per row; two rows (10 plots) at minimum, extra rows when there are more topics
        n_cols = 5
        n_rows = max(2, -(-len(df.index) // n_cols))

        # create the plot
        fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
        axs = axs.ravel()

        # the years are the same for every topic
        x = df.columns.values.tolist()

        # loop over each row of the dataframe; the row label doubles as the position of the subplot
        for index, row in df.iterrows():

            # get topic proportions
            y = row.tolist()

            # add to plot
            axs[index].plot(x, y, 'o--', color='black', linewidth=1, label="Topic prevalence")
            axs[index].set_title(get_topic_label(index), fontsize=14)
            axs[index].set_ylim([0, 0.4])

        # save plot
        plt.savefig(os.path.join(self.plot_save_folder, plot_save_name), bbox_inches='tight')
        plt.close()

    def plot_topics_over_time_stacked(self, plot_save_name='topics-over-time-stacked.pdf'):
        """
            Plot topics over time stacked

            Parameters
            ----------
            plot_save_name: string
                name of the plot
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load docs
        D = self.db.read_collection(collection='publications')

        # create dictionary where we can obtain the topic distribution per year
        year_to_topics = get_year_to_topics(D)

        # calculate the cumulative topic distribution: basically the average distribution per year
        year_to_cum_topics = get_year_to_cum_topics(year_to_topics)

        # convert dictionary to pandas dataframe, transposed so every row is a year
        df = pd.DataFrame.from_dict(year_to_cum_topics).transpose()

        # change column headers into topic labels
        df.columns = [get_topic_label(x) for x in df.columns.values]

        # plot the dataframe
        ax = df.plot(figsize=(15, 8), kind='area', colormap='Spectral_r', rot=45, grid=False)

        # set values for x-axis
        plt.xticks(df.index)

        # limit the x-axis
        plt.xlim(min(df.index), max(df.index))

        # limit the y-axis
        plt.ylim(0, 1)

        # get the legend
        handles, labels = ax.get_legend_handles_labels()

        # position it right of the figure, in reversed order so it matches the stacking of the areas
        plt.legend(handles[::-1], labels[::-1], loc='right', bbox_to_anchor=(1.35, 0.50), ncol=1, fancybox=False, shadow=False, fontsize=16)

        # save plot
        plt.savefig(os.path.join(self.plot_save_folder, plot_save_name), bbox_inches='tight')
        plt.close()

    def plot_topic_co_occurrence(self, plot_save_name='topic-co-occurrence.pdf'):
        """
            Plot topic co-occurrence

            Publications are grouped by their dominant topic; the heatmap shows, per
            dominant topic, the average proportion (in percent) of the other topics.

            Parameters
            ----------
            plot_save_name: string
                name of the plot
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # load docs
        D = self.db.read_collection(collection='publications')

        # create empty dictionary where we can store the dominant topic id and remaining other proportions
        dominant_id_to_topics = {}

        for d in D:

            # sort topics and create list
            topics = self._sorted_topic_values(d['topics'])

            # get max topic id
            max_topic_id = topics.index(max(topics))

            # add the distribution to its dominant topic, creating the key when it is new
            dominant_id_to_topics.setdefault(max_topic_id, []).append(topics)

        # cumulative topic distribution per dominant topic ID: the mean over its documents, as percentage
        dominant_id_to_cum_topics = {k: np.mean(np.array(v), axis=0) * 100. for k, v in dominant_id_to_topics.items()}

        # convert dictionary to pandas dataframe
        df = pd.DataFrame.from_dict(dominant_id_to_cum_topics)

        # change column headers into topic labels
        df.columns = [get_topic_label(x) for x in df.columns.values]
        df.index = [get_topic_label(x) for x in df.index.values]

        # create max column
        df['max'] = 0.

        # keep track of new index
        new_index = []

        # add max column so we can sort on it later
        for index, row in df.iterrows():

            row_max = max(row)

            # add value to max column; .loc writes into the dataframe itself,
            # chained indexing may write into a temporary copy
            df.loc[index, 'max'] = row_max

            # make self co-occurrence zero; a topic that is never dominant has no column of its own
            if index in df.columns:
                df.loc[index, index] = 0.0

            # add new index names to tracker so we can rename it later
            new_index.append('{} ({}%)'.format(index, round(row_max, 2)))

        # update index name
        df.index = new_index

        # sort by max column
        df = df.sort_values(by=['max'], ascending=False)

        # delete max column
        df = df.drop(['max'], axis=1)

        # order the columns by their label
        df = df.reindex(sorted(df.columns), axis=1)

        # plot the heatmap
        ax = sns.heatmap(df, cmap="Blues", annot=True, vmin=0., vmax=10., square=True, annot_kws={"size": 11},
                         fmt='.1f', mask=df <= 0.0, linewidths=.5, cbar=False, yticklabels=True)

        # adjust the figure somewhat
        ax.xaxis.tick_top()
        plt.yticks(rotation=0)
        plt.xticks(rotation=90, ha='left')
        fig = ax.get_figure()
        fig.set_size_inches(19, 6)

        # save figure
        fig.savefig(os.path.join(self.plot_save_folder, plot_save_name), bbox_inches='tight')

        # close the plot so the next heatmap is not drawn on top of this one
        plt.close()

    def plot_topics_in_journals(self, plot_save_name='topics-in-journals.pdf'):
        """
            Plot the distribution of topics within each of the journals in our dataset. This plot provides an overview of the topical content published
            by a journal given the time frame of our dataset

            Parameters
            ----------
            plot_save_name: string
                name of the plot
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # create dictionary where we have key = journal, and value = [topic_distributions]
        journal_to_topics = {}

        # load documents from database
        D = self.db.read_collection(collection='publications')

        # number of documents, obtained once instead of for every progress message
        total = D.count()

        # loop over the documents, read in the topic distribution, and add to the correct journal key
        for i, d in enumerate(D):

            # verbose process every 1000th document
            if i % 1000 == 0:
                logging.debug('Processing document {}/{}'.format(i, total))

            # get the name of the journal
            journal = d['journal']

            # documents without topics do not contribute (and do not create a journal key)
            if d.get('topics') is None:
                continue

            # append the sorted topic distribution, adding the journal as key if it does not exist yet
            journal_to_topics.setdefault(journal, []).append(self._sorted_topic_values(d['topics']))

        # get cumulative topic distributions for each journal
        journal_to_cum_topics = get_journal_to_cum_topics(journal_to_topics)

        # convert to Pandas DataFrame
        df = pd.DataFrame.from_dict(journal_to_cum_topics).T

        # change column labels to topic labels
        df.columns = [get_topic_label(x) for x in df.columns.values]

        # plot the heatmap
        ax = sns.heatmap(df, cmap="Blues", annot=True, vmin=0., vmax=.3, square=True, annot_kws={"size": 11},
                         fmt='.2f', mask=df <= 0.0, linewidths=.5, cbar=False, yticklabels=True)

        # adjust the figure somewhat
        ax.xaxis.tick_top()
        plt.yticks(rotation=0)
        plt.xticks(rotation=90, ha='left')
        fig = ax.get_figure()
        fig.set_size_inches(10, 10)

        # save figure
        fig.savefig(os.path.join(self.plot_save_folder, plot_save_name), bbox_inches='tight')

        # close the plot
        plt.close()
# adjust somewhat plt.subplots_adjust(wspace=0.0, hspace=0.0) # remove some white space plt.tight_layout() # save figure fig.savefig(os.path.join(plot_location, 'donutplot.pdf')) # close plot so we can plot again if necessary plt.close() if create_time_stacked_bar_plot: """ Create a stacked bar chart that shows the sentiment over time, each bar shows the positive, negative and neutral tweets """ # get target tweet documents from database D = db.read_collection(collection='target_tweets') # create dictionary of week numbers per tweet id dic_weeks = {} for d in D: dic_weeks[d['tweet_id']] = '{}-{}'.format( d['tweet_date'].year, str(d['tweet_date'].isocalendar()[1]).zfill(2)) # get list of year + week weeks = sorted(set([x for x in dic_weeks.values()])) # some tweets from a part of week 31 were obtained because the api looks at 7 days history but we don't need them because we only want full weeks # weeks.remove('2017-31') # create the figure environment so we can plot the barcharts
class Preprocessing():
    """
        Preprocess publications: convert full-text PDF documents into cleaned plain
        text and tokenize the content into unigrams, bigrams and entities.
    """

    # replacements applied to the raw text, in this order
    _CHARACTER_FIXES = (
        ('\r\n', '\n'),         # windows line ending
        ('\r', '\n'),           # lone carriage return
        (u'\xad', "-"),         # soft hyphen
        (u'\u2014', "-"),       # em-dash
        (u'\u2013', "-"),       # en-dash
        (u'\u2212', "-"),       # minus sign
        ('-\n', ''),            # hyphenation that occurs just before a new line
        ('\n', ' '),            # remaining new lines
        (u'\ufb02', "fl"),      # fl ligature
        (u'\ufb01', "fi"),      # fi ligature
        (u'\ufb00', "ff"),      # ff ligature
        (u'\ufb03', "ffi"),     # ffi ligature
        (u'\ufb04', "ffl"),     # ffl ligature
    )

    # everything from the last occurrence of these headings onwards is removed, in this order
    _TRAILING_SECTIONS = ("References", "Acknowledgment")

    def __init__(self):
        """
            Log the initialization and open the database connection
        """

        logging.info('Initialized {}'.format(self.__class__.__name__))

        # instantiate database
        self.db = MongoDatabase()

        # set utf8 encoding; only Python 2 needs (and supports) changing the default encoding
        if sys.version_info[0] < 3:
            reload(sys)
            sys.setdefaultencoding('utf8')

    @staticmethod
    def _document_metadata(pdf_path):
        """
            Extract meta data from folder structure and file name

            PDF documents are stored as <pdf_folder>/<journal>/<year>/<file>.pdf. The parts are
            read from the end of the path so the depth of pdf_folder does not matter.

            Parameters
            ----------
            pdf_path: string
                location of the PDF document

            Returns
            -------
            tuple of strings
                (journal, year, title); the title is the file name with hyphens turned into
                spaces and without its first four characters and the '.pdf' extension

            Raises
            ------
            ValueError
                when the path has no journal and year folder above the file
        """

        parts = os.path.normpath(pdf_path).split(os.sep)

        if len(parts) < 3:
            raise ValueError('Unexpected PDF location: {}'.format(pdf_path))

        journal, year, file_name = parts[-3:]

        return journal, year, file_name.replace('-', ' ')[4:-4].strip()

    @classmethod
    def _clean_content(cls, content):
        """
            Clean the plain text that was extracted from a PDF document

            - correct for carriage returns
            - normalize hyphens and dashes
            - correct for end-of-line hyphenation
            - remove new lines
            - correct for ligatures
            - remove bibliography and acknowledgements (a somewhat crude approach: everything
              from the last occurrence of the heading onwards is dropped)

            Boilerplate content, e.g. on the title page, is specific for each journal and is
            not removed here.

            Parameters
            ----------
            content: string
                plain text of a publication

            Returns
            -------
            string
                cleaned plain text
        """

        for old, new in cls._CHARACTER_FIXES:
            content = content.replace(old, new)

        for heading in cls._TRAILING_SECTIONS:

            # a heading at the very start (position 0) is kept, cutting there would leave no content
            position = content.rfind(heading)
            if position > 0:
                content = content[:position]

        return content

    @staticmethod
    def _frequent_bigrams(bigrams, min_bigram_count):
        """
            Keep the bigrams that occur at least min_bigram_count times

            Parameters
            ----------
            bigrams: list of tuples
                all (first word, second word) pairs of a document
            min_bigram_count: int
                lowest frequency for a bigram to be kept

            Returns
            -------
            list of strings
                'first second' repeated as often as the bigram occurs, most frequent bigram first
        """

        repeated = (
            ['{} {}'.format(pair[0], pair[1])] * count
            for pair, count in Counter(bigrams).most_common()
            if count >= min_bigram_count
        )

        return list(itertools.chain.from_iterable(repeated))

    def full_text_preprocessing(self, pdf_folder=os.path.join('files', 'pdf')):
        """
            preprocess full-text publications
            - convert pdf to plain text
            - correct for carriage returns
            - correct for end-of-line hyphenation
            - remove bibliography
            - remove acknowledgements

            The result is saved to the collection 'publications_raw'; documents that are
            already present in that collection are skipped.

            Parameters
            ----------
            pdf_folder : os.path
                location where PDF documents are stored
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read pdf files that need to be converted
        F = [x for x in read_directory(pdf_folder) if x.endswith('.pdf')]

        # read documents from DB that have already been processed so we can skip them,
        # as a set so every lookup is constant time
        processed_documents = {
            '{}-{}-{}'.format(x['journal'], x['year'], x['title'])
            for x in self.db.read_collection(collection='publications_raw')
        }

        # loop over each file and convert pdf to plain and save meta data to DB
        for i, f in enumerate(F):

            # extract meta data from folder structure and file name
            journal, year, title = self._document_metadata(f)

            # console output
            print_doc_verbose(i, len(F), journal, year, title)

            # check if PDF has already been processed
            if '{}-{}-{}'.format(journal, year, title) in processed_documents:
                logging.info('PDF document already processed, skipping ...')
                continue

            # convert content of PDF to plain text
            content = pdf_to_plain(f)

            # nothing to save when no content could be extracted
            if content is None:
                continue

            # prepare dictionary to save into MongoDB
            doc = {'journal': journal, 'title': title, 'year': year, 'content': self._clean_content(content)}

            # save to database
            self.db.insert_one_to_collection(doc=doc, collection='publications_raw')

    def general_preprocessing(self, min_bigram_count=5):
        """
            General preprocessing of publications (used for abstracts and full-text)

            Adds the field 'tokens' (unigrams + bigrams + entities) to every document of the
            collection 'publications_raw' that does not have tokens yet.

            Parameters
            ----------
            min_bigram_count : int (optional)
                frequency of bigram to occur to include into list of bigrams. Thus lower frequency than min_bigram_count will not be included.
        """

        logging.info('Start {}'.format(sys._getframe().f_code.co_name))

        # read document collection
        D = self.db.read_collection(collection='publications_raw')

        # number of documents, obtained once instead of for every document
        total = D.count()

        # setup spacy natural language processing object
        nlp = setup_spacy()

        # loop through the documents and correct content
        for i, d in enumerate(D):

            # check if tokens are already present, if so, skip
            if d.get('tokens') is not None:
                logging.debug('Document already tokenized, skipping ...')
                continue

            # print to console
            print_doc_verbose(i, total, d['journal'], d['year'], d['title'])

            # get content from document and convert to spacy object
            content = nlp(d['content'])

            # tokenize, lemmatization, remove punctuation, remove single character words
            unigrams = word_tokenizer(content)

            # get entities
            entities = named_entity_recognition(content)

            # get bigrams that occur often enough
            bigrams = self._frequent_bigrams(get_bigrams(" ".join(unigrams)), min_bigram_count)

            d['tokens'] = unigrams + bigrams + entities

            # save dictionary to database
            self.db.update_collection(collection='publications_raw', doc=d)
# location of target tweets location_tweets = os.path.join('files', 'target_tweets') # modes of research modes_of_research = ['interdisciplinary', 'multidisciplinary','transdisciplinary'] # process tweets for each mode of research for mode in modes_of_research: logging.info('Processing mode of research: {}'.format(mode)) # read tweets files F = read_directory(os.path.join(location_tweets, mode)) # tracker to keep track of processed tweet ids tweet_tracker = set(['{}{}'.format(x['tweet_type'], x['id']) for x in db.read_collection(collection = 'raw_tweets')]) # loop over each file, read content, parse relevant fields, save to db for i, f in enumerate(F): logging.info('Processing file {} {}/{}'.format(f, i + 1, len(F))) # read tweets from file (as list) tweets = read_plain_text(f, read_lines = True) # loop over each tweet for tweet in tweets: # convert string to json tweet = json.loads(tweet)
analysis}. The dataset is available from http://www.sananalytics.com/lab/. Tweets are saved into the collection 'sanders_tweets_raw' """ # name of the collection to store tweets to db_collection = 'sanders_tweets_raw' # location sanders tweets sanders_tweets_location = os.path.join('files', 'training_tweets', 'sanders', 'sanders_tweets.csv') # read sanders tweets data = read_csv(sanders_tweets_location) # read tweets that have already been processed (if you run for the first time, this will be an empty set) processed_tweets = set([x['tweet_id'] for x in db.read_collection(collection = db_collection)]) # loop over each row of the CSV file for i, row in enumerate(data): # verbose logging.info('Processing tweet {}/{}'.format(i + 1, len(data))) # get values from columns tweet_label = row[1] tweet_id = row[2] # check if tweet_id has already been processed if not tweet_id in processed_tweets: # get content of the tweet
# create logging to console set_logger() # verbose logging.info('Start: {} '.format(__file__)) # create database connection db = MongoDatabase() # load classifier clf = joblib.load(os.path.join('files', 'ml_models3', 'LinearSVC.pkl')) # read labels for target tweets that have been manually labeled and convert to dictionary with key = tweet ID and value = label true_labels = { d['tweet_id']: d['label'] for d in db.read_collection(collection='sanders_tweets_raw') } # load tweets for which we want to infer the sentiment label D = db.read_collection(collection='target_tweets_UNCW') # create empty numpy array so we can retrieve labels later on somewhat faster labels = np.zeros((D.count(), 3), dtype=np.int64) # loop over each target tweet for i, d in enumerate(D): logging.debug(' - Processing tweet {}/{}'.format(i + 1, D.count())) # check if we have a true label for the target tweet, if so, skip prediction and use true label if d['tweet_id'] in true_labels: