async def calc_cos(self):
    """This function calculates a cosine similarity score for each sentence against each topic. """
    count = 1
    total = len(get_file_list())
    file_name = "cos_score.csv"
    file_path = get_score_file_path(file_name)
    input_stream = self.get_input_stream()
    async for data in input_stream:
        sentences = data['text']
        sent_mat = self._model.transform(sentences)
        # Cosine similarity of every sentence against each of the 30 topic vectors
        cosine_similarities = []
        for i in range(30):
            cosine_similarities.append(
                linear_kernel(self._topic_sparse_mat['topic' + str(i)], sent_mat).flatten())
        n = len(sentences)
        for i in range(n):
            if len(sentences[i]) > 20:
                score_dict = {'10k_path': [data['path']],
                              'sentence_index': [i],
                              'joined tokens': [sentences[i]]}
                for j in range(30):
                    score_dict['topic' + str(j) + ' score'] = (
                        cosine_similarities[j][i] if cosine_similarities[j][i] > 0.1 else None)
                df = pd.DataFrame(score_dict)
                # Keep the row only if at least one of the 30 topic score columns exceeds the threshold
                if df.iloc[0, 3:].sum() > 0:
                    if os.path.isfile(file_path):
                        # Append without repeating the header
                        df.to_csv(file_path, mode='a', encoding='utf-8', index=False, header=False)
                    else:
                        df.to_csv(file_path, encoding='utf-8', index=False)
        print_progress(count, total)
        count += 1
    print('')
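# Why linear_kernel can stand in for cosine similarity above: TfidfVectorizer
# L2-normalizes its rows by default, so the plain dot product of two TF-IDF rows
# equals their cosine similarity. A minimal, standalone sketch (assuming
# self._model is a fitted TfidfVectorizer; the documents below are made up):
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import numpy as np

_docs = ["market risk and liquidity risk", "revenue grew during the quarter"]
_mat = TfidfVectorizer().fit_transform(_docs)
assert np.allclose(linear_kernel(_mat, _mat), cosine_similarity(_mat, _mat))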
def download(tickers):
    """This function downloads 10-K filings from SEC EDGAR for each ticker."""
    path = get_filings_folder()
    dl = Downloader(path)
    n = len(tickers)
    for i in range(n):
        print_progress(i, n)
        # Skip tickers whose filings are already on disk
        if not os.path.exists('../Filings/sec_edgar_filings/' + tickers[i]):
            dl.get_10k_filings(tickers[i])
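# Hedged usage sketch: the ticker symbols below are placeholders for
# illustration only, not part of the project.
if __name__ == "__main__":
    download(["AAPL", "MSFT", "GOOG"])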
async def train_dictionary(self):
    """This function trains a new gensim dictionary from the corpus. """
    input_stream = self.get_input_stream()
    # Train the dictionary
    count = 1
    total = len(get_file_list())
    async for data in input_stream:
        await self.run(data)
        print_progress(count, total)
        count += 1
    print("")
    self.save_dict()
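# What "training" the dictionary amounts to: gensim's Dictionary maps tokens to
# integer ids and can be grown incrementally with add_documents, which is what a
# streaming pipeline like the one above relies on. Minimal sketch with made-up
# tokens (self.run is assumed to do something equivalent per document):
from gensim.corpora import Dictionary

_dictionary = Dictionary()
_dictionary.add_documents([["market", "risk"]])
_dictionary.add_documents([["liquidity", "risk"]])
print(_dictionary.token2id)  # e.g. {'market': 0, 'risk': 1, 'liquidity': 2}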
@classmethod
async def generate_dataset(cls):
    """This function is used to create a dataset for the LightTag platform. """
    # Build the pipeline
    data_stream = cls.get_input_stream(cls.SCHEMA)
    pipeline = cls()
    # Create the dataset
    count = 1
    total = len(get_file_list())
    async for data in data_stream:
        await pipeline.run(data)
        print_progress(count, total)
        count += 1
    print("")
def __iter__(self):
    """Generates data from the corpus file.

    Yields:
        :obj:`list` of :obj:`(int, int)`
    """
    completed = 1
    with open(self.get_file_path(), "r") as data_file:
        for line in data_file:
            data = json.loads(line)
            # Only yield documents whose label matches the current mode
            if data["label"] == self._mode:
                for document in data.get("text"):
                    yield document
                print_progress(completed, len(self))
                completed += 1
    print("")
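# A streaming corpus like the one above can be handed directly to gensim, which
# re-reads it from disk on each pass instead of keeping it in memory. A minimal,
# self-contained sketch of that pattern (the toy texts, dictionary, and topic
# count are illustrative, not the project's):
from gensim.corpora import Dictionary
from gensim.models import LdaModel

_texts = [["risk", "market", "liquidity"], ["revenue", "growth", "quarter"]]
_dict = Dictionary(_texts)

class _StreamingCorpus:
    def __iter__(self):
        for tokens in _texts:
            yield _dict.doc2bow(tokens)  # list of (token_id, count) pairs

_lda = LdaModel(corpus=_StreamingCorpus(), id2word=_dict, num_topics=2, passes=1)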
@classmethod
async def prepare_data(cls):
    """Runs a pipeline to generate data for training a TF-IDF model and saves it to a file. """
    # Build the pipeline
    data_stream = cls.get_input_stream(cls.SCHEMA)
    pipeline = cls()
    # Process the data
    count = 1
    total = len(get_file_list())
    async for data in data_stream:
        await pipeline.run(data)
        print_progress(count, total)
        count += 1
    print("")
@classmethod
async def prepare_data(cls):
    """Runs a pipeline to generate data for training an LDA model and saves it to a file. """
    # Build the pipeline: documents -> DictionaryPipeline (bag-of-words) -> LDA corpus pipeline
    dictionary_input = DictionaryPipeline.get_input_stream(cls.SCHEMA)
    dictionary = DictionaryPipeline(input_stream=dictionary_input)
    bow_stream = dictionary.output_stream()
    lda_corpus_pipeline = cls()
    print("Did not find any corpus data. Preparing it now.")
    # Create the data
    count = 1
    total = len(get_file_list())
    async for data in bow_stream:
        await lda_corpus_pipeline.run(data)
        print_progress(count, total)
        count += 1
    print("")
async def calc_risk(self):
    """This function calculates a risk score for each filing. """
    dictionary, model = self.load_model()  # for this model, risk topic id is 15
    input_stream = self.get_input_stream()
    count = 1
    total = len(get_file_list())
    tickers = []
    filing_dates = []
    total_num_sent = []
    total_risk_sent = []
    total_risk_top1 = []
    total_risk_top2 = []
    total_risk_top3 = []
    total_risk_top4 = []
    total_risk_words = []
    total_uncertain_words = []
    weighted_risk_score = []
    weighted_rank_score = []
    async for data in input_stream:
        list_of_tokenized_words = data['text']
        path = data['path'].split('\\')
        ticker = path[1]
        filing_date = path[-1][:10]
        total_sent = len(list_of_tokenized_words)
        scores = []
        ranks = []
        risk_num = 0
        risk_top1 = 0
        risk_top2 = 0
        risk_top3 = 0
        risk_top4 = 0
        risk_word = 0
        uncertain_word = 0
        for tokens in list_of_tokenized_words:
            bow_vector = dictionary.doc2bow(tokens)
            # Rank the topics for this sentence by descending probability
            for idx, (topic_id, score) in enumerate(
                    sorted(model[bow_vector], key=lambda tup: -1 * tup[1])):
                if topic_id == 15:
                    scores.append(score)
                    ranks.append(idx + 1)
                    risk_num += 1
                    if idx < 1:
                        risk_top1 += 1
                    elif idx < 2:
                        risk_top2 += 1
                    elif idx < 3:
                        risk_top3 += 1
                    elif idx < 4:
                        risk_top4 += 1
            # Count lexicon hits for risk and uncertainty words
            for token in tokens:
                if re.match(self.RISK_WORD, token):
                    risk_word += 1
                elif re.match(self.UNCERTAIN_WORD, token):
                    uncertain_word += 1
        tickers.append(ticker)
        filing_dates.append(filing_date)
        total_num_sent.append(total_sent)
        total_risk_sent.append(risk_num)
        total_risk_top1.append(risk_top1)
        total_risk_top2.append(risk_top2)
        total_risk_top3.append(risk_top3)
        total_risk_top4.append(risk_top4)
        total_risk_words.append(risk_word)
        total_uncertain_words.append(uncertain_word)
        if len(scores) > 0:
            weighted_risk_score.append(np.mean(scores))
            weighted_rank_score.append(np.mean(ranks))
        else:
            weighted_risk_score.append(0)
            weighted_rank_score.append(0)
        print_progress(count, total)
        count += 1
    print('')
    df = pd.DataFrame({
        'ticker': tickers,
        'filing dates': filing_dates,
        'total number of sentences': total_num_sent,
        'total number of risk sentences': total_risk_sent,
        'score rank 1': total_risk_top1,
        'score rank 2': total_risk_top2,
        'score rank 3': total_risk_top3,
        'score rank 4': total_risk_top4,
        'average of risk score': weighted_risk_score,
        'average of ranks': weighted_rank_score,
        'total number of risk word': total_risk_words,
        'total number of uncertain word': total_uncertain_words
    })
    file_name = "risk_score.csv"
    file_path = get_score_file_path(file_name)
    df.to_csv(file_path)
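# Hedged, self-contained sketch of the ranking logic inside calc_risk: gensim's
# model[bow_vector] yields (topic_id, probability) pairs, and sorting them by
# descending probability gives the rank (idx) used to bucket risk sentences.
# The pairs below are hard-coded stand-ins for a trained model's output.
_topic_probs = [(3, 0.12), (15, 0.55), (7, 0.20)]
for _idx, (_topic_id, _score) in enumerate(sorted(_topic_probs, key=lambda tup: -1 * tup[1])):
    if _topic_id == 15:  # the risk topic in this model
        print(f"risk topic ranked #{_idx + 1} with probability {_score:.2f}")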