Example #1
def parse_for_lda(nrows=None):
    """
    Loads the data from CSV and performs some basic cleaning. Essentially the
    cleaning removes corrupted lines.
    """
    # Load the data
    df, _ = import_csv()

    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
    df['year'] = df['date'].dt.year
    df = df[df['year'] > 1999]

    #drop irrelevant section names
    start_n = df.shape[0]
    df = df[~df['section_name'].isin([
        'Biler', 'Bolig-besøg', 'Bolig-indret', 'Bolig-items', 'Cannes',
        'Cannes Lions', 'Design', 'Digital Gadget', 'Gastronomi', 'Karriere',
        'Kriminal', 'Kultur', 'Livsstil', 'Magasin Pleasure',
        'Magasin Pleasure 2. sektion', 'Magasin Pleasure 2. sektion Rejser',
        'Magasin Pleasure 2. sektion Rejser Hoteller Stil',
        'Magasin Pleasure Biler', 'Magasin Pleasure Design',
        'Magasin Pleasure EM', 'Magasin Pleasure Firmabilen 2015',
        'Magasin Pleasure Interiør', 'Magasin Pleasure kunst & kultur',
        'Magasin Pleasure Portræt', 'Magasin Pleasure rejser',
        'Magasin Pleasure Ure', 'Michelin', 'Motion', 'Play 2016', 'Pleasure',
        'Portræt', 'Profil & Karriere', 'Underholdning', 'Week-div',
        'Week-golf', 'Week-livsstil', 'Week-mad', 'Week-maritim', 'Week-mode',
        'Week-motor', 'Week-rejser', 'Weekend Diverse', 'Weekend Golf',
        'Weekend Kultur', 'Weekend Livsstil', 'Weekend Livstil', 'Weekend Mad',
        'Weekend Maritim', 'Weekend Mode', 'Weekend Motor', 'Weekend Outdoor',
        'Weekend Rejser'
    ])]
    end_n = df.shape[0]
    print('Dropped {} articles with irrelevant section names'.format(start_n -
                                                                     end_n))
    print(f'Current number of articles: {end_n}')

    #drop word count below 50
    df['word_count'] = df['body'].str.count(' ') + 1
    start_n = df.shape[0]
    df = df[df.word_count >= 50]
    end_n = df.shape[0]
    print('Dropped {} articles with less than 50 words'.format(start_n -
                                                               end_n))
    print(f'Current number of articles: {end_n}')

    df['body'] = __clean_text(df['body'])

    # create unique row index
    df['article_id'] = df.reset_index().index
    print('Columns: ', df.columns)

    with h5py.File(
            os.path.join(params().paths['parsed_news'],
                         params().filenames['parsed_news']), 'w') as hf:
        string_dt = h5py.string_dtype(encoding='utf-8')
        hf.create_dataset('parsed_strings', data=df, dtype=string_dt)
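The HDF5 file written above can be read back through the same 'parsed_strings' key. A minimal sketch, assuming the same params() paths used in parse_for_lda; the helper name read_parsed_news_raw is hypothetical, and the repository's own read_h5py helper presumably does something similar.

# Hedged sketch: read the parsed articles back from the HDF5 file written above.
import os
import h5py

def read_parsed_news_raw():
    path = os.path.join(params().paths['parsed_news'],
                        params().filenames['parsed_news'])
    with h5py.File(path, 'r') as hf:
        rows = hf['parsed_strings'][:]  # array of UTF-8 encoded strings
    return rows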
Example #2
def create_corpus(lda_instance):

    # Helper-class to create BoW-corpus "lazily"
    class CorpusSplitter:
        def __init__(self, test_share):
            self.test_share = test_share
            self.test_corpus = []

        def __iter__(self):
            for line in lda_instance.articles:
                if random.random() <= self.test_share:
                    self.test_corpus.append(
                        lda_instance.dictionary.doc2bow(
                            lda_instance.bigram_phraser[line.split()]))
                    continue
                else:
                    yield lda_instance.dictionary.doc2bow(
                        lda_instance.bigram_phraser[line.split()])

    # Serialize corpus using either BoW or tf-idf
    corpus_bow = CorpusSplitter(lda_instance.test_share)

    file_path = os.path.join(params().paths['lda'], 'corpus.mm')
    file_path_test = os.path.join(params().paths['lda'], 'corpus_test.mm')
    try:
        lda_instance.TrainCorpus = gensim.corpora.MmCorpus(file_path)
        if lda_instance.test_share > 0.0:
            lda_instance.TestCorpus = gensim.corpora.MmCorpus(file_path_test)
        print("Loaded pre-existing corpus")
    except FileNotFoundError:
        print("Corpus not found, creating from scratch")
        if not hasattr(lda_instance, 'bigram_phraser'):
            lda_instance.load_bigrams()

        # Serialize corpus (either BoW or tf-idf)
        if not params().options['lda']['tf-idf']:
            print("\tSerializing corpus, BoW")
            gensim.corpora.MmCorpus.serialize(file_path, corpus_bow)
            if lda_instance.test_share > 0.0:
                gensim.corpora.MmCorpus.serialize(file_path_test,
                                                  corpus_bow.test_corpus)
        else:
            print("\tSerializing corpus, tf-idf")
            tfidf = gensim.models.TfidfModel(corpus_bow)
            train_corpus_tfidf = tfidf[corpus_bow]
            gensim.corpora.MmCorpus.serialize(file_path, train_corpus_tfidf)
            if lda_instance.test_share > 0.0:
                tfidf = gensim.models.TfidfModel(corpus_bow.test_corpus)
                test_corpus_tfidf = tfidf[corpus_bow.test_corpus]
                gensim.corpora.MmCorpus.serialize(file_path_test,
                                                  test_corpus_tfidf)

        lda_instance.TrainCorpus = gensim.corpora.MmCorpus(file_path)
        if lda_instance.test_share > 0.0:
            lda_instance.TestCorpus = gensim.corpora.MmCorpus(file_path_test)
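Because CorpusSplitter assigns documents to the test set while it is being iterated, the training corpus has to be serialized first; only afterwards is corpus_bow.test_corpus populated. A minimal sketch of inspecting the serialized training corpus, assuming corpus.mm exists at the path used above:

# Hedged sketch: the serialized corpus supports random access.
import os
import gensim

corpus_path = os.path.join(params().paths['lda'], 'corpus.mm')
mm = gensim.corpora.MmCorpus(corpus_path)
print(mm.num_docs)        # number of training documents
print(list(mm[0])[:10])   # first ten (token_id, weight) pairs of document 0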
Example #3
def load_model(lda_instance, num_topics):
    try:
        folder_path = os.path.join(params().paths['root'],
                                   params().paths['lda'],
                                   'lda_model_' + str(num_topics))
        file_path = os.path.join(folder_path, 'trained_lda')
        lda_instance.lda_model = gensim.models.LdaMulticore.load(file_path)
        print("LDA-model with {} topics loaded".format(num_topics))
    except FileNotFoundError:
        print("Error: LDA-model not found")
        lda_instance.lda_model = None
Example #4
 def load_processed_text(self):
     try:
         with h5py.File(os.path.join(params().paths['lda'], params().filenames['lda_cleaned_text']), 'r') as hf:
             print("Loading processed data from HDF-file")
             data = hf['parsed_strings'][:]
             rows = list(zip(*data))
             self.article_id = rows[0]
             self.articles = rows[1]
             print("\t{} documents loaded".format(len(self.articles)))
         return 1
     except OSError:
         return 0
Example #5
def _load_u_count(sample_size=0,extend=True):
    if extend:
        filename = params().filenames['parsed_news_uc_ext']
    else:
        filename = params().filenames['parsed_news_uc']

    file_path = os.path.join(params().paths['enriched_news'],filename)
    df = read_h5py(file_path)
    df = df[['article_id', 'body', 'u_count', 'n_count', 'word_count']]
    if sample_size > 0:
        return df.sample(sample_size)
    else:
        return df
Example #6
def merge_unseen_docs(lda_instance, start_date=None, end_date=None):
    articles = read_h5py(
        os.path.join(params().paths['parsed_news'],
                     params().filenames['parsed_news']))

    #subset to date range
    if start_date is not None and end_date is not None:
        articles = articles.loc[(articles.date >= start_date)
                                & (articles.date < end_date)]
        filename = (params().filenames['lda_merge_doc_topics_file'] + '_' +
                    start_date.strftime("%Y-%m-%d") + '_' +
                    end_date.strftime("%Y-%m-%d"))
    else:
        filename = params().filenames['lda_merge_doc_topics_file']

    texts = list(zip(articles['article_id'].values, articles['body'].values))

    # Find document-topics for the document-intersection above
    with Pool(6) as pool:
        document_topics = pool.map(
            partial(lda_instance.get_topics, lda_instance.lda_model,
                    lda_instance.dictionary), [i[1] for i in texts])

    df_lda = pd.DataFrame({
        'article_id': [i[0] for i in texts],
        'topics': [[x[1] for x in document_topics[i]]
                   for i in range(len(document_topics))]
    })

    df_lda = pd.merge(df_lda,
                      articles[['article_id', 'headline', 'date']],
                      how='inner',
                      on='article_id')
    print(df_lda.columns.to_list())

    folder_path = params().paths['doc_topics']
    topics_path = os.path.join(folder_path, filename)

    df2 = pd.DataFrame(df_lda.topics.values.tolist(), index=df_lda.index)
    df_enriched_lda = pd.concat(
        [df2, df_lda[['article_id', 'headline', 'date']]], axis=1)
    print(df_enriched_lda.columns.to_list())
    del (df2)
    df_enriched_lda.to_hdf(topics_path,
                           'table',
                           format='table',
                           mode='w',
                           append=False)
Example #7
    def build(self, logic, bloom_dict_name,
              start_year=2000, end_year=2020, frq='Q', u_weight=False, extend=True):
        """
        Finds for articles containing words in bloom dictionary. Saves result to disk.
        args:
        dict_name: name of bloom dict in params
        logic: matching criteria in params
        """
        self.start_year = start_year
        self.end_year = end_year
        self.logic = logic
        self.frq = frq
        if self.end_year == 2020:
            self.end_str = '-06-30'
        else:
            self.end_str = ''
        
        out_path = params().paths['indices']+self.name+'\\'+self.logic
        if not os.path.exists(out_path):
            os.makedirs(out_path)        

        if extend:
            bloom_dict = extend_dict_w2v(bloom_dict_name, n_words=10)
            df = read_h5py(os.path.join(params().paths['enriched_news'],
                                       params().filenames['parsed_news_uc_ext']))
        else:
            bloom_dict = params().dicts[bloom_dict_name]
            df = read_h5py(os.path.join(params().paths['parsed_news'],
                                        params().filenames['parsed_news']))

        b_E, b_P, b_U = _get_bloom_sets(bloom_dict)
        print('\n\nEconomic words: ' + repr(b_E) +
              '\n\n Political words: ' + repr(b_P) +
              '\n\n Uncertainty words: ' + repr(b_U))
        
        #stem articles
        with Pool() as pool:
            df['body_stemmed'] = pool.map(_stemtext, 
                                          df['body'].values.tolist())
        if u_weight:
            logic_str = params().options['bloom_logic_weighted']
        else:
            logic_str = params().options['bloom_logic'][self.logic]
            
        print('\nLogic: '+logic_str)
        #compare to dictionary
        with Pool() as pool:
            df['idx'] = pool.map(partial(_bloom_compare, 
                                         logic=logic_str, 
                                         bloom_E=b_E, 
                                         bloom_P=b_P, 
                                         bloom_U=b_U), 
                                         df['body_stemmed'].values.tolist())
        if u_weight:
            df['idx'] = df['idx']*((df['n_count']+df['u_count'])/df['word_count'])

        self.idx = self.aggregate(df, norm=True, lda=False)
        return self.idx
Example #8
    def __init__(self,
                 num_topics,
                 metric='jensenshannon',
                 method='ward',
                 unique_scale=True,
                 topn=None):
        """
        Saves linkage matrix `Z´ and `nodelist´
        args:
            num_topics (int): Selects LDA model.
            metric (str): Metric passed to scipy.spatial.distance.pdist 
            method (str): Method passed to scipy.cluster.hierarchy
            unique_scale (bool): Scale word proba by uniqueness
            topn (int, optional): only consider X words (don't use)
        """

        self.num_topics = num_topics
        self.metric = metric
        self.method = method
        self.scale = 200

        folder_path = os.path.join(params().paths['lda'],
                                   'lda_model_' + str(self.num_topics))
        file_path = os.path.join(folder_path, 'trained_lda')
        self.lda_model = gensim.models.LdaMulticore.load(file_path)
        topics = self.lda_model.get_topics()
        if unique_scale:
            topics = topics / (topics.sum(axis=0))
        if topn:
            topics.sort(axis=1)
            topics = np.flip(topics, axis=1)
            topics = topics[:, 0:topn]
        y = pdist(topics, metric=self.metric)
        self.Z = hierarchy.linkage(y, method=self.method)
        rootnode, self.nodelist = hierarchy.to_tree(self.Z, rd=True)
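A minimal visualization sketch, assuming matplotlib is installed; tree stands for an instance of the class above (the class name is not shown in the snippet), and the dendrogram labels are just topic indices.

# Hedged sketch: draw the topic dendrogram from the stored linkage matrix Z.
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy

def plot_topic_dendrogram(tree):
    fig, ax = plt.subplots(figsize=(12, 6))
    hierarchy.dendrogram(tree.Z,
                         labels=list(range(tree.num_topics)),
                         leaf_rotation=90,
                         ax=ax)
    ax.set_ylabel('{} distance ({} linkage)'.format(tree.metric, tree.method))
    fig.tight_layout()
    plt.show()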
Example #9
    def aggregate(self, df, col='idx', norm=True, write_csv=True, method='mean', lda=True):
        """
        Aggregates to means within 
        each aggregation frequency
        args:
        df (DataFrame): input data
        col (str): column to aggregate
        norm (bool): add column of normalized values
        write_csv (bool): write result as csv.
        method (str): agg_func, 'mean' or 'sum'
        returns:
        DataFrame of aggregation result with datetime index.
        """     
        cols = [col, 'date']
        if lda:
            cols.extend(self.topics)

        df.set_index('date', inplace=True, drop=False)
        idx = df[cols].groupby(
            [pd.Grouper(key='date', freq=self.frq)]
        ).agg([method])

        if norm:
            scaler = StandardScaler()
            idx = pd.DataFrame(scaler.fit_transform(idx), columns=idx.columns, index=idx.index)

        idx = idx[str(self.start_year):str(self.end_year)+self.end_str]
        idx.columns = idx.columns.get_level_values(0)
        print("Last month: ", idx[-1:])
        #idx.to_pickle(params().paths['indices']+self.name+'_'+self.frq+'.pkl')
        if write_csv:
            dump_csv(params().paths['indices'], self.name+'_'+self.frq, idx.iloc[:,0], verbose=False)
        return idx
Example #10
def save_topics_to_hdf(df,suffix):
    df2 = pd.DataFrame(df.topics.values.tolist(), index = df.index)
    print(df.dtypes)
    df = pd.concat([df2, df[['article_id', 'date', 'u_count', 'n_count', 'word_count']]], axis=1)
    df['u_count'] = df['u_count'].astype(np.int64)
    df['n_count'] = df['n_count'].astype(np.int64)
    print(df.dtypes)
    del(df2)
    df.to_hdf(params().paths['doc_topics']+'doc_topics_'+suffix+'.h5', 'table', format='table', mode='w', append=False)
Example #11
    def _get_topic_sums(self):
        """Get sum of topic probabilities across articles.
        """
        df = pd.read_hdf(
            params().paths['doc_topics'] + 'doc_topics_u_count_extend.h5',
            'table')
        df = df.iloc[:, 0:self.num_topics].values.tolist()

        self.topic_sums = np.array(df).sum(axis=0)
Example #12
 def __init__(self, lemmatizer, test_share=0.05, test=False):
     self.dictionary = None
     self.articles = []
     self.article_id = []
     self.SerializedCorpus = None
     self.test = test        
     self.lemmatizer = lemmatizer
     self.test_share = test_share
             
     #if params().options['lda']['log']:
     import logging
     try:
         os.remove(params().paths['lda']+'lda_log.txt')
     except (FileNotFoundError, PermissionError):
         pass
     logging.basicConfig(filename=params().paths['lda']+'lda_log.txt',
                         format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     logger = logging.getLogger(__name__)
     warnings.filterwarnings('ignore', category=DeprecationWarning)
Example #13
    def parse_topic_labels(self, name):
        """
        reads hand labeled topics from json file.
        
        """
        label_path = os.path.join(params().paths['topic_labels'],
                                  name + str(self.num_topics) + '.json')

        with codecs.open(label_path, 'r', encoding='utf-8-sig') as f:
            self.labels = json.load(f)
        return self.labels
Example #14
def load_models(lda_instance, topics, plot=False):
    lda_models = []
    file_list = []
    for t in topics:
        print(t)
        file_list.append(
            os.path.join(params().paths['root'],
                         params().paths['lda'],
                         'lda_model_' + str(t) + '\\trained_lda'))

    for f in file_list:
        print(f)
        try:
            lda_model_n = gensim.models.LdaMulticore.load(f)
            lda_models.append(lda_model_n)

        except FileNotFoundError:
            print(f"Error: LDA-model at {f} not found")

    lda_instance.lda_models = lda_models
Example #15
def load_parsed_data(sample_size=None):
    filelist = glob.glob(params().paths['parsed_news'] + 'boersen*.pkl')
    # DataFrame.append is deprecated/removed in recent pandas; collect and concat instead
    frames = []
    for f in filelist:
        with open(f, 'rb') as f_in:
            frames.append(pickle.load(f_in))
    df = pd.concat(frames) if frames else pd.DataFrame()
    if sample_size is not None:
        return df.sample(sample_size)
    else:
        return df
Example #16
def get_perplexity(lda_model, lda_instance, chunksize=2000):
    file_path_test = os.path.join(params().paths['lda'], 'corpus_test.mm')
    mm = gensim.corpora.mmcorpus.MmCorpus(
        file_path_test)  # `mm` document stream now has random access
    # valid document ids run from 0 to num_docs - 1
    sample = [random.randint(0, mm.num_docs - 1) for i in range(chunksize)]
    test_corpus = []
    for doc in sample:
        test_corpus.append(mm[doc])
    perplexity = np.exp2(
        -lda_model.log_perplexity(test_corpus, len(lda_instance.articles)))
    return perplexity
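gensim's log_perplexity returns a per-word likelihood bound, and perplexity is 2 raised to its negative, which is what the np.exp2 call above computes. A minimal usage sketch, assuming an lda_instance whose model has already been loaded:

# Hypothetical usage: evaluate the loaded model on a 2000-document sample
# of the held-out corpus; lower perplexity means a better fit.
perplexity = get_perplexity(lda_instance.lda_model, lda_instance, chunksize=2000)
print("Held-out perplexity: {:.1f}".format(perplexity))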
Example #17
def create_dictionary(lda_instance,
                      load_bigrams=True,
                      unwanted_words=None,
                      keep_words=None):

    # Clean and write texts to HDF
    if not lda_instance.load_processed_text():
        lda_instance.load_and_clean_body_text()

    # Create dictionary (id2word)
    file_path = os.path.join(params().paths['lda'],
                             params().filenames['lda_dictionary'])

    # Load bigram phraser
    if load_bigrams:
        lda_instance.load_bigrams()

    try:
        lda_instance.dictionary = gensim.corpora.Dictionary.load(file_path)
        print("Loaded pre-existing dictionary")
    except FileNotFoundError:
        print("Dictionary not found, creating from scratch")

        lda_instance.dictionary = gensim.corpora.Dictionary(
            articles for articles in lda_instance)

        lda_instance.dictionary.filter_extremes(
            no_below=params().options['lda']['no_below'],
            no_above=params().options['lda']['no_above'],
            keep_n=params().options['lda']['keep_n'],
            keep_tokens=keep_words)
        if unwanted_words is None:
            unwanted_words = []
        unwanted_ids = [
            k for k, v in lda_instance.dictionary.items()
            if v in unwanted_words
        ]
        lda_instance.dictionary.filter_tokens(bad_ids=unwanted_ids)
        lda_instance.dictionary.compactify()
        lda_instance.dictionary.save(file_path)
    print("\t{}".format(lda_instance.dictionary))
Example #18
def extend_dict_w2v(dict_name, n_words=10):
    """
    Extends bloom dictionary with similar words using a pre-trained
    embedding. Default model: https://fasttext.cc/docs/en/crawl-vectors.html
    args:
    params: input_params.json
    dict_name: name of Bloom dict in params
    n_words: include n_nearest words to subject word.
    """
    model = KeyedVectors.load_word2vec_format(params().paths['w2v_model'], binary=False)
    print("Word2vec model loaded")
    dict_out = copy.deepcopy(params().dicts[dict_name])
    for k, v in params().dicts[dict_name].items():
        for val in v:
            #print('\n'+v)
            try:
                similar_words = [w[0] for w in model.most_similar(positive=val, topn=n_words)]
                dict_out[k].extend(_check_stem_duplicates(similar_words))
                #print('\n',model.most_similar(positive=v))
            except KeyError:
                continue
    return dict_out
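A minimal usage sketch, mirroring the call in build() above; 'bloom' is a placeholder for whatever dictionary key exists in params().dicts.

# Hypothetical usage: add the 10 nearest embedding neighbours to each seed word.
extended = extend_dict_w2v('bloom', n_words=10)   # 'bloom' is a placeholder key
for category, words in extended.items():
    print(category, len(words))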
Example #19
def merge_lda_u(extend=True,sample_size=0,num_topics=90):
    """Merges uncertainty counts and topic vectors.
    args:
        extend (bool): Use extended set of u-words.
        sample_size (int): Return a random sample of articles.
        num_topics (int): LDA model to use.
    returns:
        DataFrame with columns 'topics' and 'u_count'
    """
    if extend:
        suffix='u_count_extend'
    else:
        suffix='u_count'
    try:
        df = pd.read_hdf(params().paths['doc_topics']+'doc_topics_'+suffix+'.h5', 'table')
        df['date'] = pd.to_datetime(df['date'])
        
        #convert columns to single col list
        df['topics']= df.iloc[:,0:num_topics].values.tolist()
        df.drop(df.columns[0:num_topics], axis=1, inplace=True)
        if sample_size > 0:
            return df.sample(sample_size) 
        return df
    
    except FileNotFoundError:
        print('File not found, merging lda topics and uncertainty counts...')
        df_u = _load_u_count(extend=extend, sample_size=sample_size)
        df = pd.read_hdf(params().paths['doc_topics']+'document_topics.h5', 'table')
        df['topics']= df.iloc[:,0:num_topics].values.tolist()
        df.drop(df.columns[0:num_topics], axis=1, inplace=True)
        df = df.merge(df_u, 'inner', 'article_id')
        df['date'] = pd.to_datetime(df['date'])
        save_topics_to_hdf(df,suffix)

        if sample_size > 0:
            return df.sample(sample_size) 
        return df
Example #20
def corpus2bow(lda_instance):
    """Returns test corpus in bow format: list of (word_id,word_count)
    """
    lda_instance.dictionary[1]
    bow_dict = copy.deepcopy(lda_instance.dictionary.id2token)
    bow_dict = {k: 0 for (k, v) in bow_dict.items()}
    file_path = os.path.join(params().paths['lda'], 'corpus_test.mm')
    mm = gensim.corpora.mmcorpus.MmCorpus(
        file_path)  # `mm` document stream now has random access
    for doc in range(0, mm.num_docs, 1):
        doc_dict = dict(mm[doc])
        for k, v in doc_dict.items():
            bow_dict[k] = bow_dict[k] + v
    bow_list = [(k, v) for k, v in bow_dict.items()]
    return bow_list
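The returned word ids can be mapped back to tokens via the same id2token mapping the function relies on; a minimal sketch listing the most frequent tokens in the test corpus:

# Hedged sketch: top-20 test-corpus tokens by aggregated count.
bow_list = corpus2bow(lda_instance)
top = sorted(bow_list, key=lambda kv: kv[1], reverse=True)[:20]
for token_id, count in top:
    print(lda_instance.dictionary.id2token[token_id], int(count))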
Example #21
 def parse_topic_labels(self, name):
     """
     reads hand labeled topics from json file.
     args:
     name (str): name of json file (num_topics must be suffix)
     returns: 
     (dict) with labels
     """
     label_path = os.path.join(params().paths['topic_labels'], 
                               name+str(self.num_topics)+'.json')
       
     with codecs.open(label_path, 'r', encoding='utf-8-sig') as f:
         self.labels = json.load(f)
     self.labels['EP'] = list(set().union(self.labels['EP_int'], self.labels['EP_dk']))
     return self.labels
Example #22
    def validate(self):
        """Method for calculating correlations with macro time-series 
        and index/vix/vdax
        """
        v1x, vix = self.load_vix(f='M')
        data = pd.read_csv(params().paths['input']+'validation.csv', header=0)
        data['date'] = pd.to_datetime(data['date'], format="%Ym%m") + pd.tseries.offsets.MonthEnd(1)
        data = data.set_index('date')

        idx = self.idx.filter(regex='_norm', axis=1)
        
        for var in data.columns:
            data[var] = _normalize(data[var])
            self.corr[var] = \
                (_calc_corr(idx, data[var]),
                 _calc_corr(v1x, data[var]),
                 _calc_corr(vix, data[var]))
        return self.corr
Example #23
def save_models(lda_instance):

    # Save all models in their respective folder
    for i, lda_model in enumerate(lda_instance.lda_models):
        try:
            folder_path = os.path.join(
                params().paths['lda'],
                'lda_model_' + str(lda_model.num_topics))
            file_path = os.path.join(folder_path, 'trained_lda')
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
            lda_model.save(file_path)
            print("LDA-model #{} saved ({} topics)".format(
                i, lda_model.num_topics))
        except FileNotFoundError:
            print("Error: LDA-file not found")
        except IndexError:
            print("Error: List index out of range")
Example #24
def docs2bow(sample_size=2000):
    file_path = os.path.join(params().paths['lda'], 'corpus.mm')
    mm = gensim.corpora.mmcorpus.MmCorpus(
        file_path)  # `mm` document stream now has random access
    if sample_size is not None:
        # valid document ids run from 0 to num_docs - 1
        sample = [random.randint(0, mm.num_docs - 1) for i in range(sample_size)]
        corpus_bow = []
        for doc in sample:
            corpus_bow.append(mm[doc])
    else:
        corpus_bow = []
        for doc in range(0, mm.num_docs, 1):
            corpus_bow.append(mm[doc])
    word_ids = [item for sublist in corpus_bow for item in sublist]
    df = pd.DataFrame(word_ids, columns=['word', 'count'], dtype='int')
    df = df.groupby(['word'])['count'].sum().reset_index()
    bow = [tuple(x) for x in df.values]
    return bow
Example #25
def uncertainty_count(extend=True, workers=16):
    """
    Counts u-words in articles. Saves result as HDF to disk.
    args:
        extend (bool): Use extended set of u-words
    """
    if extend:
        U_set = set(list(params().dicts['uncertainty_ext'].values())[0])
        filename = params().filenames['parsed_news_uc_ext']
    else:
        U_set = set(list(params().dicts['uncertainty'].values())[0])
        filename = params().filenames['parsed_news_uc']

    print(U_set)
    #get parsed articles
    df = read_h5py(os.path.join(params().paths['parsed_news'],
                               params().filenames['parsed_news']))

    #stem articles
    with Pool(workers) as pool:
        df['body_stemmed'] = pool.map(_stemtext,
                                      df['body'].values.tolist())
    
    #compare to dictionary
    with Pool(workers) as pool:
        df['u_count'] = pool.map(partial(_count, 
                                 word_set=U_set), 
                                 df['body_stemmed'].values.tolist())
    
    
    N_list = list(params().dicts['negations'].values())[0]
    with Pool(workers) as pool:
        df['n_count'] = pool.map(partial(_count_n, 
                             word_list=N_list), 
                             df['body'])
        
    #save to disk
    df.drop(columns='body_stemmed', inplace=True)
    outpath = os.path.join(params().paths['enriched_news'],filename)
    with h5py.File(outpath, 'w') as hf:
        string_dt = h5py.string_dtype(encoding='utf-8')
        hf.create_dataset('parsed_strings', data=df, dtype=string_dt)
Example #26
 def load_bigrams(self):
     if os.path.isfile(os.path.join(params().paths['lda'],'phrases.pkl')):
         phrases = gensim.utils.SaveLoad.load(os.path.join(params().paths['lda'],'phrases.pkl'))
         self.bigram_phraser = gensim.models.phrases.Phraser(phrases)
         print("Bigram phraser loaded")
     else:
         print("Bigram phraser not found, training")
         with h5py.File(os.path.join(params().paths['lda'], params().filenames['lda_cleaned_text']), 'r') as hf:
             data = hf['parsed_strings'][:]
             articles_to_phrasing = [a[1].split() for a in data]
         phrases = gensim.models.phrases.Phrases(articles_to_phrasing, params().options['lda']['no_below'], threshold=100)
         phrases.save(os.path.join(params().paths['lda'],'phrases.pkl'), separately=None, sep_limit=10485760, ignore=frozenset([]), pickle_protocol=2)
         self.bigram_phraser = gensim.models.phrases.Phraser(phrases)
         print("Bigram phraser loaded")
Example #27
def import_scraped_articles():
    path = params().paths['scraped']  # use your path
    all_files = glob.glob(path + "/*/scraped*.csv")

    dfs = []
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0, sep=";")
        dfs.append(df)
    df = pd.concat(dfs, axis=0, ignore_index=True)
    df['date'] = df['date'].replace('404', np.nan)
    df = df.dropna(subset=['date'])
    df['date'] = pd.to_datetime(df['date'])
    df = df.drop_duplicates(subset=['headline_web', 'date'])
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 1', 'sep=', 'headline_q'])
    # url ledelse/ is missing proper date (date is date of scrape), drop these
    df = df.loc[df['url'].str.find('/ledelse/') == -1]
    df.rename(columns={
        'headline_web': 'headline',
        'bodytext': 'body',
        'url': 'byline_alt'
    },
              inplace=True)
    return df
Example #28
def print_topics(lda_instance, topn=30, unique_sort=True):
    lda_model = lda_instance.lda_model

    csv_path = os.path.join(params().paths['lda'],
                            'topic_words' + str(lda_model.num_topics) + '.csv')
    header = ['topic_' + str(x) for x in range(lda_model.num_topics)]

    if not unique_sort:
        word_lists = []
        for t in range(lda_model.num_topics):
            word_list = lda_model.show_topic(t, topn)
            if not len(word_lists):
                word_list = [[w[0]] for w in word_list]
                word_lists = word_list
            else:
                word_list = [w[0] for w in word_list]
                for i in range(topn):
                    word_lists[i].append(word_list[i])
        with open(csv_path, mode='w', newline='\n',
                  encoding='utf-8-sig') as csv_out:
            csvwriter = csv.writer(csv_out, delimiter=',')
            csvwriter.writerow(header)
            for i in range(topn):
                csvwriter.writerow(word_lists[i])
        return word_lists

    else:
        df = get_unique_words(lda_instance, topn)

        df = df[['word']]
        df.index = pd.MultiIndex.from_arrays(
            [df.index.get_level_values(1),
             df.groupby(level=1).cumcount()],
            names=['token', 'topic'])
        df = df.unstack(level=0)
        df.to_csv(csv_path, header=header, encoding='utf-8-sig', index=False)
        return df
Example #29
    def load_vix(self,frq='M'):
        """Loads vix and vdax from csv files
        args:
        frq = observation frequency for vix
        returns: (df,df)
        """
        # v1x = pd.read_csv(params().paths['input']+'v1x_monthly.csv',
        #                   names=['date','v1x'], header=0)
        #
        # v1x['date'] = pd.to_datetime(v1x['date'])
        # v1x.set_index('date', inplace=True)
        # v1x = v1x[str(self.start_year):str(self.end_year)+self.end_str]
        # v1x['v1x'] = _normalize(v1x['v1x'])

        vix = pd.read_csv(params().paths['input'] + 'vixcurrent.csv',
                          names=['date', 'vix'], header=1)
        vix['date'] = pd.to_datetime(vix['date'])
        vix.set_index('date', inplace=True)
        vix = vix.resample(frq).mean()
        vix.columns = vix.columns.get_level_values(0)
        vix = vix[str(self.start_year):str(self.end_year)+self.end_str]
        vix['vix'] = _normalize(vix['vix'])
                   
        return vix
Example #30
    def load_and_clean_body_text(self):
        print("No existing pre-processed data found. Loading h5-file for preprocessing")

        df = read_h5py(os.path.join(params().paths['parsed_news'],
                                    params().filenames['parsed_news']))

        try:
            self.articles.extend(list(df['body'].values))
            self.article_id.extend(list(df['article_id'].values))
        except KeyError:
            print("File doesn't contain any body-text")

        # Perform LDA on smaller sample, just for efficiency in case of testing...
        if self.test is True:
            random.seed(1)
            test_idx = random.sample(range(0, len(self.articles)), params().options['lda']['test_size'])
            self.articles = [self.articles[i] for i in test_idx]
            self.article_id = [self.article_id[i] for i in test_idx]

        # Pre-process LDA-docs
        if len(self.articles):
            print("\tProcessing {} documents for LDA".format(len(self.articles)))
            with Pool(params().options['threads']) as pool:
                self.articles = pool.map(partial(preprocess, 
                                                 lemmatizer=self.lemmatizer),
                                                 self.articles)

            print("\tSaving cleaned documents")
            folder_path = params().paths['lda']
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)

            file_path = os.path.join(folder_path, params().filenames['lda_cleaned_text'])
            with h5py.File(file_path, 'w') as hf:
                data = np.array(list(zip(self.article_id, self.articles)), dtype=object)
                string_dt = h5py.string_dtype(encoding='utf-8')
                hf.create_dataset('parsed_strings', data=data, dtype=string_dt)
        # Train bigram model
        self.load_bigrams()