import os, sys, time import glob import types import ipywidgets as widgets import text_corpus import domain_logic_vatican as domain_logic import nltk import pandas as pd import zipfile sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path import common.widgets_config as widgets_config import common.utility as utility logger = utility.getLogger('corpus_text_analysis') from nltk.parse import corenlp STANFORD_CORE_NLP_URL = 'http://localhost:9000' def merge_entities(entities): n_entities = len(entities) if n_entities <= 1: return entities merged = entities[:1] for doc_id, i_n, w_n, t_n in entities[1:]: doc_id_p, i_p, w_p, t_p = merged[-1] if i_n == i_p + 1 and t_n == t_p: merged[-1] = (doc_id, i_n, '_'.join([w_p, w_n]), t_p)
import os
import shutil
import pandas as pd
import glob
import time
import zipfile
from common.utility import getLogger

logger = getLogger()


class FileUtility:
    """Small helper bound to one directory on disk."""

    def __init__(self, directory):
        """Store the directory path; nothing is created or checked here."""
        self.directory = directory

    def create(self, clear_target_dir=False):
        """Make sure ``self.directory`` exists, optionally wiping it first.

        Args:
            clear_target_dir: when true and the directory already exists, it is
                removed recursively before being created again.

        Returns:
            ``self``, so the call can be chained.
        """
        target = self.directory
        # Wipe only on request, and only if there is something to wipe.
        if clear_target_dir and os.path.exists(target):
            shutil.rmtree(target)
        # Re-check after the optional wipe so a freshly removed tree is recreated.
        if not os.path.exists(target):
            os.makedirs(target)
        return self

    @staticmethod
    def read_excel(filename, sheet):
        if not os.path.isfile(filename):
            raise Exception("File {0} does not exist!".format(filename))
        # NOTE(review): SOURCE is cut off at this point. The dangling statement
        # is preserved verbatim below; the rest of read_excel is not visible.
        # with pd.ExcelFile(filename) as xls:
from IPython.display import display import pandas as pd import nltk os.sys.path = os.sys.path if '..' in os.sys.path else os.sys.path + ['..'] import common.widgets_config as widgets_config import common.config as config import common.color_utility as color_utility import common.utility as utility import headnote_corpus from pprint import pprint as pp logger = utility.getLogger(name='title_analysis') OUTPUT_OPTIONS = { 'Table': 'table', 'Table, grid': 'qgrid', 'Table, unstacked': 'unstack', 'Plot bar': 'plot_bar', 'Plot stacked bar': 'plot_stacked_bar', 'Plot line': 'plot_line', 'Plot area': 'plot_area', 'Plot stacked area': 'plot_stacked_area' } EXTRA_GROUPBY_OPTIONS = { '': None, 'Topic': [ 'topic_category' ],
import ipywidgets as widgets import pandas as pd import common.config as config import common.utility as utility import common.treaty_utility as treaty_utility import common.widgets_config as widgets_config import common.color_utility as color_utility import analysis_data import analysis_plot from IPython.display import display from pprint import pprint as pp logger = utility.getLogger('tq_by_topic') def display_topic_quantity(period_group=0, topic_group=None, party_group=None, recode_is_cultural=False, normalize_values=False, extra_other_category=False, chart_type_name=None, plot_style='classic', target_quantity="topic", wti_index=None, progress=utility.noop): try: # print(locals()) progress()
from IPython.display import display os.sys.path = os.sys.path if '..' in os.sys.path else os.sys.path + ['..'] import common.config as config import common.widgets_config as widgets_config import common.color_utility as color_utility import common.utility as utility from common.network.layout import layout_setups, layout_network from common.network.networkx_utility import create_nx_subgraph, get_positioned_edges2, get_positioned_nodes from network_analysis_plot import plot_network, get_palette from network_analysis import create_party_network, slice_network_datasource, setup_node_size, adjust_node_label_offset NETWORK_LAYOUT_OPTIONS = {x.name: x.key for x in layout_setups} logger = utility.getLogger('network_analysis') warnings.filterwarnings('ignore') NETWORK_PLOT_OPTS = dict( x_axis_type=None, y_axis_type=None, background_fill_color='white', line_opts=dict(color='green', alpha=0.5), node_opts=dict(color=None, level='overlay', alpha=1.0), ) NODE_SIZE_OPTIONS = { '(default)': None, 'Degree centrality': 'degree', 'Closeness centrality': 'closeness', 'Betweenness centrality': 'betweenness',
import collections import pandas as pd from corpora.corpus_source_reader import SparvCorpusSourceReader from corpora.zip_utility import ZipReader from common.utility import getLogger, extend logger = getLogger(__name__) KindOfPoS = collections.namedtuple('KindOfPoS', 'tag description is_deliminator') SUC_POS_TAGS = { 'AB': KindOfPoS(tag='AB', description={ 'en': 'Adverb', 'se': 'Adverb' }, is_deliminator=False), 'DT': KindOfPoS(tag='DT', description={ 'en': 'Determiner', 'se': 'Determinerare, bestämningsord' }, is_deliminator=False), 'HA': KindOfPoS(tag='HA', description={ 'en': 'Interrogative/Relative Adverb',
import ipywidgets as widgets import itertools import types import pandas as pd import common.widgets_config as widgets_config import common.config as config import common.utility as utility import common.treaty_utility as treaty_utility import common.color_utility as color_utility import analysis_data import analysis_plot from pprint import pprint as pp from IPython.display import display logger = utility.getLogger('tq_by_party') OTHER_CATEGORY_OPTIONS = { 'Other category': 'other_category', 'All category': 'all_category', 'Nothing': '' } def display_quantity_by_party(period_group_index=0, party_name='', parties=None, year_limit=None, treaty_filter='', extra_category='', normalize_values=False, chart_type_name=None,
from geopy.geocoders import GoogleV3# GeoNames, Nominatim, GoogleV3 # if explicit use of geopy
from . geocode_loc_tags import assign_geocodes, load_swener_tags, get_country
import numpy as np
import pandas as pd
import common.file_utility as file_utility
import common.utility as utility

logger = utility.getLogger(__name__)


def setup_unique_locations_dataframe(df_tags, geocoded_filename):
    """Build the table of distinct location entities, seeded with known geocodes.

    Rows of ``df_tags`` whose ``category`` contains ``'LOC'`` are reduced to
    their distinct ``entity`` values. Each entity gets empty (NaN) geocoding
    columns, which are then filled from the sheet ``'Sheet1'`` of
    ``geocoded_filename`` wherever that sheet has a value.

    Args:
        df_tags: frame with at least ``category``, ``year`` and ``entity`` columns.
        geocoded_filename: Excel file holding earlier geocoding results, with an
            ``entity`` column.

    Returns:
        Frame indexed by ``entity``, as produced by ``combine_first`` of the
        empty table with the previously geocoded one.
    """
    is_location = df_tags.category.str.contains('LOC')
    df_locations = df_tags.loc[is_location, ['year', 'entity']]
    df = df_locations['entity'].drop_duplicates().to_frame()

    # Placeholder columns; combine_first below only fills cells that are NaN.
    for column in ('processed', 'latitude', 'longitude', 'reversename', 'country'):
        df[column] = np.nan
    df = df.set_index('entity')

    df_geocoded = file_utility.FileUtility.read_excel(filename=geocoded_filename, sheet='Sheet1').set_index('entity')
    return df.combine_first(df_geocoded)


def assign_country_to_locations(df):
    """Add country name and country-code columns to ``df`` in place.

    Every ``reversename`` value is stringified and passed to ``get_country``.
    The ``name``, ``alpha_2`` and ``alpha_3`` attributes of the result are
    written to ``country``, ``country_code`` and ``country_code3``; a ``None``
    result yields ``None`` in all three columns. Nothing is returned.
    """
    countries = df['reversename'].map(lambda name: get_country(str(name)))
    targets = (
        ('country', 'name'),
        ('country_code', 'alpha_2'),
        ('country_code3', 'alpha_3'),
    )
    for column, attribute in targets:
        # Bind `attribute` as a default so each lambda keeps its own value.
        df[column] = countries.map(
            lambda match, attribute=attribute: getattr(match, attribute) if match is not None else None
        )


# NOTE(review): SOURCE is cut off here; only the signature of the next function
# is visible. It is preserved verbatim below.
# def process_geocoding(df_tags, geolocator, geocoded_filename, geocoded_output_filename):
# -*- coding: utf-8 -*-
import os
import pandas as pd
from common.file_utility import FileUtility
import common.utility as utility

join = os.path.join
logger = utility.getLogger('NotebookDataGenerator')


class NotebookDataGenerator():
    """Prepare and extract various data from an LDA model.

    The main purpose is to prepare data for Jupyter notebooks.
    """

    def __init__(self, store):
        """Keep a reference to ``store`` for later use."""
        self.store = store

    def _compile_dictionary(self, lda):
        """Return the model vocabulary as a DataFrame indexed by ``token_id``.

        Args:
            lda: model object exposing ``id2word``, a mapping of token id to
                token that also has a ``dfs`` attribute (token id -> document
                frequency, or ``None``).

        Returns:
            DataFrame indexed by ``token_id`` with the columns ``token`` and
            ``dfs``. ``dfs`` is 0 for every token without a recorded frequency,
            and the frame is empty when the vocabulary is empty.
        """
        logger.info('Compiling dictionary...')
        id2word = lda.id2word
        entries = list(id2word.items())
        frequencies = id2word.dfs if id2word.dfs is not None else {}
        dictionary = pd.DataFrame({
            'token_id': [token_id for token_id, _ in entries],
            'token': [token for _, token in entries],
            # Look each frequency up by token id instead of zipping
            # dfs.values() positionally: the two mappings need not iterate in
            # the same order, and dfs may be missing some ids.
            'dfs': [frequencies.get(token_id, 0) for token_id, _ in entries],
        }).set_index('token_id')[['token', 'dfs']]
        return dictionary

    # NOTE(review): SOURCE is cut off here; only the signature of the next
    # method is visible. It is preserved verbatim below.
    # def __compile_document_topics_iter(self, lda, mm, minimum_probability):
import ipywidgets as widgets import pandas as pd import common.config as config import common.utility as utility import common.treaty_utility as treaty_utility import common.widgets_config as widgets_config import common.color_utility as color_utility import analysis_data import analysis_plot import logging import types from IPython.display import display from pprint import pprint as pp logger = utility.getLogger('tq_by_topic', level=logging.WARNING) def display_topic_quantity(period_group=0, topic_group=None, party_group=None, recode_is_cultural=False, normalize_values=False, extra_other_category=False, chart_type_name=None, plot_style='classic', target_quantity="topic", treaty_sources=None, wti_index=None, progress=utility.noop): try: