async def calc_cos(self):
    """This function calculates a cos similarity score """
    count = 1
    total = len(get_file_list())
    file_name = "cos_score.csv"
    file_path = get_score_file_path(file_name)
    input_stream = self.get_input_stream()
    async for data in input_stream:
        sentences = data['text']
        sent_mat = self._model.transform(sentences)
        # One similarity vector per topic: every sentence scored against topic i
        cosine_similarities = []
        for i in range(30):
            cosine_similarities.append(
                linear_kernel(self._topic_sparse_mat['topic' + str(i)], sent_mat).flatten())
        n = len(sentences)
        for i in range(n):
            if len(sentences[i]) > 20:
                score_dict = {'10k_path': [data['path']],
                              'sentence_index': [i],
                              'joined tokens': [sentences[i]]}
                for j in range(30):
                    score_dict['topic' + str(j) + ' score'] = \
                        cosine_similarities[j][i] if cosine_similarities[j][i] > 0.1 else None
                df = pd.DataFrame(score_dict)
                # Only write rows where at least one topic clears the 0.1 threshold;
                # columns 3:33 are the 30 topic-score columns
                if df.iloc[0, 3:33].sum() > 0:
                    if os.path.isfile(file_path):
                        # Append without a header once the file exists
                        df.to_csv(file_path, mode='a', encoding='utf-8', index=False, header=False)
                    else:
                        df.to_csv(file_path, encoding='utf-8', index=False)
        print_progress(count, total)
        count += 1
    print('')
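# Standalone sketch (not part of the pipeline) of the scoring step inside calc_cos:
# a TF-IDF transform followed by linear_kernel, which on L2-normalised TF-IDF vectors
# equals cosine similarity. The vectorizer, topic text and sentences below are
# illustrative stand-ins for self._model and self._topic_sparse_mat, which are built
# elsewhere in the pipeline.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

example_sentences = ["litigation risk remains material", "revenue grew year over year"]
example_topic = ["litigation legal proceedings risk"]           # hypothetical topic seed text
vectorizer = TfidfVectorizer().fit(example_sentences + example_topic)  # stand-in for the trained model
topic_vec = vectorizer.transform(example_topic)                 # stand-in for one topic row
sent_mat = vectorizer.transform(example_sentences)
scores = linear_kernel(topic_vec, sent_mat).flatten()           # one cosine score per sentence
print(scores)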
async def train_dictionary(self):
    """This function trains a new gensim dictionary from the corpus. """
    input_stream = self.get_input_stream()
    # Train the dictionary
    count = 1
    total = len(get_file_list())
    async for data in input_stream:
        await self.run(data)
        print_progress(count, total)
        count += 1
    print("")
    self.save_dict()
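# Minimal sketch of what the dictionary training amounts to, assuming self.run adds
# each file's tokenised sentences to a gensim Dictionary and save_dict persists it;
# the toy corpus and output path below are illustrative only.
from gensim.corpora import Dictionary

tokenised_docs = [["risk", "factors"], ["net", "revenue", "increased"]]
example_dictionary = Dictionary()
example_dictionary.add_documents(tokenised_docs)   # incremental, one batch per file
example_dictionary.save("example.dict")            # hypothetical output path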
async def tokenise_sentences():
    """ This function tokenises sentences contained in the data files and yields them one at a time

    Yields:
        str: A single sentence
    """
    files = get_file_list()
    file_pipeline = ReadFilePipeline(input_stream=files)
    sentence_pipeline = SentencePipeline(
        input_stream=file_pipeline.output_stream())
    async for sentence in sentence_pipeline.output_stream():
        yield sentence
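# Usage sketch: tokenise_sentences is an async generator, so it has to be consumed
# from a coroutine. The driver below is illustrative and assumes the data files are
# already present on disk.
import asyncio

async def _print_first_sentences(limit=5):
    seen = 0
    async for sentence in tokenise_sentences():
        print(sentence)
        seen += 1
        if seen >= limit:
            break

# asyncio.run(_print_first_sentences())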
async def generate_dataset(cls):
    """This function is used to create a dataset for the LightTag platform """
    # Build the pipeline
    data_stream = cls.get_input_stream(cls.SCHEMA)
    pipeline = cls()
    # Create the dataset
    count = 1
    total = len(get_file_list())
    async for data in data_stream:
        await pipeline.run(data)
        print_progress(count, total)
        count += 1
    print("")
async def prepare_data(cls):
    """Runs a pipeline to generate data for training a TF-IDF model and saves it to a file. """
    # Build the pipeline
    data_stream = cls.get_input_stream(cls.SCHEMA)
    pipeline = cls()
    # Process the data
    count = 1
    total = len(get_file_list())
    async for data in data_stream:
        await pipeline.run(data)
        print_progress(count, total)
        count += 1
    print("")
def tokenise_words():
    """ This function tokenises words contained in sentences

    Yields:
        str: A single word
    """
    lemma_pipeline = LemmaPipeline()
    word_pipeline = WordPipeline([lemma_pipeline])
    sentence_pipeline = SentencePipeline([word_pipeline])
    file_pipeline = ReadFilePipeline([sentence_pipeline])
    loop = asyncio.get_event_loop()
    files = get_file_list()
    print(len(files))
    for file_path in files:
        loop.run_until_complete(file_pipeline.run(file_path))
    for root in lemma_pipeline.result:
        yield root
    loop.close()
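# Usage sketch: unlike tokenise_sentences, tokenise_words is a plain generator that
# drives the event loop itself, so synchronous code can iterate it directly. The
# consumer below is illustrative and assumes the data files are already present.
from collections import Counter

# word_counts = Counter(tokenise_words())   # counts every lemma across all files
# print(word_counts.most_common(10))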
async def prepare_data(cls):
    """Runs a pipeline to generate data for training an LDA model and saves it to a file. """
    # Build the pipeline
    dictionary_input = DictionaryPipeline.get_input_stream(cls.SCHEMA)
    dictionary = DictionaryPipeline(input_stream=dictionary_input)
    bow_stream = dictionary.output_stream()
    lda_corpus_pipeline = cls()
    print("Did not find any corpus data. preparing now")
    # Create the data
    count = 1
    total = len(get_file_list())
    async for data in bow_stream:
        await lda_corpus_pipeline.run(data)
        print_progress(count, total)
        count += 1
    print("")
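# Sketch of the downstream step this data feeds, assuming the saved corpus is a list
# of bag-of-words vectors and the dictionary comes from DictionaryPipeline; the toy
# documents and parameters below are illustrative, not the project's settings.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

toy_docs = [["risk", "litigation", "risk"], ["revenue", "growth", "margin"]]
toy_dictionary = Dictionary(toy_docs)
bow_corpus = [toy_dictionary.doc2bow(doc) for doc in toy_docs]
lda = LdaModel(corpus=bow_corpus, id2word=toy_dictionary, num_topics=2, passes=5)
print(lda.print_topics())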
async def calc_risk(self):
    """This function calculates a risk score """
    dictionary, model = self.load_model()
    # for this model, risk topic id is 15
    input_stream = self.get_input_stream()
    count = 1
    total = len(get_file_list())
    tickers = []
    filing_dates = []
    total_num_sent = []
    total_risk_sent = []
    total_risk_top1 = []
    total_risk_top2 = []
    total_risk_top3 = []
    total_risk_top4 = []
    total_risk_words = []
    total_uncertain_words = []
    weighted_risk_score = []
    weighted_rank_score = []
    async for data in input_stream:
        list_of_tokenized_words = data['text']
        path = data['path'].split('\\')
        ticker = path[1]
        filing_date = path[-1][:10]
        total_sent = len(list_of_tokenized_words)
        scores = []
        ranks = []
        risk_num = 0
        risk_top1 = 0
        risk_top2 = 0
        risk_top3 = 0
        risk_top4 = 0
        risk_word = 0
        uncertain_word = 0
        for tokens in list_of_tokenized_words:
            bow_vector = dictionary.doc2bow(tokens)
            # Topics sorted by descending probability; idx is the rank of the risk topic
            for idx, (topic_id, score) in enumerate(
                    sorted(model[bow_vector], key=lambda tup: -1 * tup[1])):
                if topic_id == 15:
                    scores.append(score)
                    ranks.append(idx + 1)
                    risk_num = risk_num + 1
                    if idx < 1:
                        risk_top1 += 1
                    elif idx < 2:
                        risk_top2 += 1
                    elif idx < 3:
                        risk_top3 += 1
                    elif idx < 4:
                        risk_top4 += 1
            for token in tokens:
                if re.match(self.RISK_WORD, token):
                    risk_word += 1
                elif re.match(self.UNCERTAIN_WORD, token):
                    uncertain_word += 1
        tickers.append(ticker)
        filing_dates.append(filing_date)
        total_num_sent.append(total_sent)
        total_risk_sent.append(risk_num)
        total_risk_top1.append(risk_top1)
        total_risk_top2.append(risk_top2)
        total_risk_top3.append(risk_top3)
        total_risk_top4.append(risk_top4)
        total_risk_words.append(risk_word)
        total_uncertain_words.append(uncertain_word)
        if len(scores) > 0:
            weighted_risk_score.append(np.mean(scores))
            weighted_rank_score.append(np.mean(ranks))
        else:
            weighted_risk_score.append(0)
            weighted_rank_score.append(0)
        print_progress(count, total)
        count += 1
    print('')
    df = pd.DataFrame({
        'ticker': tickers,
        'filing dates': filing_dates,
        'total number of sentences': total_num_sent,
        'total number of risk sentences': total_risk_sent,
        'score rank 1': total_risk_top1,
        'score rank 2': total_risk_top2,
        'score rank 3': total_risk_top3,
        'score rank 4': total_risk_top4,
        'average of risk score': weighted_risk_score,
        'average of ranks': weighted_rank_score,
        'total number of risk word': total_risk_words,
        'total number of uncertain word': total_uncertain_words
    })
    file_name = "risk_score.csv"
    file_path = get_score_file_path(file_name)
    df.to_csv(file_path)
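# Sketch of the ranking logic inside calc_risk: model[bow_vector] yields
# (topic_id, probability) pairs, and the risk topic's rank is its position once the
# pairs are sorted by descending probability. The distribution below is a made-up
# example; in calc_risk it comes from the trained LDA model.
example_distribution = [(3, 0.12), (15, 0.41), (22, 0.30)]      # illustrative model output
ranked = sorted(example_distribution, key=lambda tup: -tup[1])
for rank, (topic_id, score) in enumerate(ranked, start=1):
    if topic_id == 15:
        print(f"risk topic ranked {rank} with probability {score:.2f}")  # ranked 1, 0.41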
def get_input_stream():
    """An input stream for the pipeline """
    file_paths = sorted(get_file_list())
    for file_path in file_paths:
        yield file_path