async def calc_cos(self):
    """Calculates cosine similarity scores between each sentence and the 30
    topic vectors, and appends qualifying rows to a CSV file.
    """
    count = 1
    total = len(get_file_list())
    file_name = "cos_score.csv"
    file_path = get_score_file_path(file_name)
    input_stream = self.get_input_stream()
    async for data in input_stream:
        sentences = data['text']
        sent_mat = self._model.transform(sentences)
        cosine_similarities = []
        for i in range(30):
            cosine_similarities.append(
                linear_kernel(self._topic_sparse_mat['topic' + str(i)], sent_mat).flatten())
        for i in range(len(sentences)):
            if len(sentences[i]) > 20:
                score_dict = {'10k_path': [data['path']],
                              'sentence_index': [i],
                              'joined tokens': [sentences[i]]}
                for j in range(30):
                    score_dict['topic' + str(j) + ' score'] = (
                        cosine_similarities[j][i] if cosine_similarities[j][i] > 0.1 else None)
                df = pd.DataFrame(score_dict)
                # Columns 3-32 hold the 30 topic scores; only write rows where at
                # least one score cleared the 0.1 threshold
                if df.iloc[0, 3:33].sum() > 0:
                    if os.path.isfile(file_path):
                        with open(file_path, 'a', encoding='utf-8') as f:
                            df.to_csv(f, encoding='utf-8', index=False, header=False)
                    else:
                        df.to_csv(file_path, encoding='utf-8', index=False)
        print_progress(count, total)
        count += 1
    print('')
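The model used by calc_cos appears to behave like a scikit-learn TF-IDF vectoriser (it exposes transform and its output feeds linear_kernel), and with L2-normalised TF-IDF rows the linear kernel equals cosine similarity. A self-contained sketch of that scoring idea, using made-up topic and sentence texts rather than the project's fitted model:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Hypothetical stand-ins for self._model and self._topic_sparse_mat
corpus = ["interest rate risk", "supply chain disruption", "revenue grew this quarter"]
vectoriser = TfidfVectorizer().fit(corpus)

topic_vec = vectoriser.transform(["interest rate risk"])              # one topic vector
sent_mat = vectoriser.transform(["interest rate exposure rose", "revenue grew"])

# With the default L2 normalisation, linear_kernel gives cosine similarity
scores = linear_kernel(topic_vec, sent_mat).flatten()
print(scores)  # one score per sentence; higher means closer to the topic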
async def train_dictionary(self):
    """This function trains a new gensim dictionary from the corpus.
    """
    input_stream = self.get_input_stream()
    # Train the dictionary
    count = 1
    total = len(get_file_list())
    async for data in input_stream:
        await self.run(data)
        print_progress(count, total)
        count += 1
    print("")
    self.save_dict()
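train_dictionary drives the pipeline's run() and save_dict() methods, which are not shown in these snippets; the underlying gensim usage is roughly the following sketch (the token lists and output path are illustrative):

from gensim.corpora import Dictionary

# Illustrative tokenised documents; the real pipeline streams one file at a time
texts = [["interest", "rate", "risk"], ["revenue", "grew", "quarter"]]

dictionary = Dictionary()
for tokens in texts:
    dictionary.add_documents([tokens])  # incremental updates mirror per-file processing

dictionary.save("corpus.dict")  # hypothetical output path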
Example #3
async def tokenise_sentences():
    """
    This function tokenises sentences contained in the data files and yields
    them one at a time

    Yields:
        str: A single sentence
    """
    files = get_file_list()
    file_pipeline = ReadFilePipeline(input_stream=files)
    sentence_pipeline = SentencePipeline(
        input_stream=file_pipeline.output_stream())
    async for sentence in sentence_pipeline.output_stream():
        yield sentence
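tokenise_sentences is an async generator, so callers consume it with async for from inside a coroutine. A small usage sketch (it assumes the project's pipeline classes and data files are importable and available):

import asyncio

async def preview_sentences(limit=5):
    # Pull the first few sentences from the async generator
    collected = []
    async for sentence in tokenise_sentences():
        collected.append(sentence)
        if len(collected) >= limit:
            break
    return collected

print(asyncio.run(preview_sentences()))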
Example #4
    async def generate_dataset(cls):
        """This function is used to create a dataset for the LightTag platform
        """
        # Build the pipeline
        data_stream = cls.get_input_stream(cls.SCHEMA)
        pipeline = cls()

        # create the dataset
        count = 1
        total = len(get_file_list())
        async for data in data_stream:
            await pipeline.run(data)
            print_progress(count, total)
            count += 1
        print("")
Example #5
    async def prepare_data(cls):
        """Runs a pipeline to generate data for training a TF-IDF model and
        saves it to a file.
        """
        # Build the pipeline
        data_stream = cls.get_input_stream(cls.SCHEMA)
        pipeline = cls()

        # Process the data
        count = 1
        total = len(get_file_list())
        async for data in data_stream:
            await pipeline.run(data)
            print_progress(count, total)
            count += 1
        print("")
Example #6
def tokenise_words():
    """
    This function tokenises words contained in sentences

    Yields:
        str: A single word
    """
    # Chain the pipelines: file -> sentence -> word -> lemma
    lemma_pipeline = LemmaPipeline()
    word_pipeline = WordPipeline([lemma_pipeline])
    sentence_pipeline = SentencePipeline([word_pipeline])
    file_pipeline = ReadFilePipeline([sentence_pipeline])
    loop = asyncio.get_event_loop()
    files = get_file_list()
    print(len(files))
    for file_path in files:
        loop.run_until_complete(file_pipeline.run(file_path))
        for root in lemma_pipeline.result:
            yield root
    loop.close()
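The Pipeline classes used above are not shown in these examples; from the constructors they appear to push each stage's output to the downstream pipelines passed in. A rough, hypothetical sketch of that producer/consumer wiring with asyncio.Queue (none of these names come from the project):

import asyncio

async def produce(words, queue):
    # Feed raw tokens downstream, then send a sentinel to signal completion
    for word in words:
        await queue.put(word)
    await queue.put(None)

async def consume(queue, results):
    # Receive tokens until the sentinel and apply a stand-in "lemmatisation"
    while True:
        word = await queue.get()
        if word is None:
            break
        results.append(word.lower())

async def main():
    queue = asyncio.Queue()
    results = []
    await asyncio.gather(produce(["The", "Cats", "Ran"], queue), consume(queue, results))
    print(results)  # ['the', 'cats', 'ran']

asyncio.run(main())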
Example #7
    async def prepare_data(cls):
        """Runs a pipeline to generate data for training an LDA model and
        saves it to a file.
        """
        # Build the pipeline
        dictionary_input = DictionaryPipeline.get_input_stream(cls.SCHEMA)
        dictionary = DictionaryPipeline(input_stream=dictionary_input)
        bow_stream = dictionary.output_stream()
        lda_corpus_pipeline = cls()

        print("Did not find any corpus data. Preparing now.")
        # create the data
        count = 1
        total = len(get_file_list())
        async for data in bow_stream:
            await lda_corpus_pipeline.run(data)
            print_progress(count, total)
            count += 1
        print("")
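The bow_stream consumed above is presumably a gensim-style bag-of-words stream: each tokenised document reduced to (token_id, count) pairs through the dictionary. A minimal illustration of that conversion (the token lists are made up):

from gensim.corpora import Dictionary

# Illustrative tokenised documents; the real pipeline streams these from the data files
texts = [["interest", "rate", "risk"], ["revenue", "grew", "quarter"]]

dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in texts]
print(bow_corpus)  # one list of (token_id, count) pairs per document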
Example #8
    async def calc_risk(self):
        """Calculates a risk score for each filing from the LDA model's risk
        topic (topic 15) and from counts of risk and uncertainty words, then
        saves the results to a CSV file.
        """
        dictionary, model = self.load_model()
        # for this model, risk topic id is 15
        input_stream = self.get_input_stream()
        # Iterate over the filings and accumulate per-filing risk statistics
        count = 1
        total = len(get_file_list())
        tickers = []
        filing_dates = []
        total_num_sent = []
        total_risk_sent = []
        total_risk_top1 = []
        total_risk_top2 = []
        total_risk_top3 = []
        total_risk_top4 = []
        total_risk_words = []
        total_uncertain_words = []
        weighted_risk_score = []
        weighted_rank_score = []
        async for data in input_stream:
            list_of_tokenized_words = data['text']
            path = data['path'].split('\\')
            ticker = path[1]
            filing_date = path[-1][:10]
            total_sent = len(list_of_tokenized_words)
            scores = []
            ranks = []
            risk_num = 0
            risk_top1 = 0
            risk_top2 = 0
            risk_top3 = 0
            risk_top4 = 0
            risk_word = 0
            uncertain_word = 0
            for tokens in list_of_tokenized_words:
                bow_vector = dictionary.doc2bow(tokens)
                for idx, (topic_id, score) in enumerate(
                        sorted(model[bow_vector],
                               key=lambda tup: -1 * tup[1])):
                    if topic_id == 15:
                        scores.append(score)
                        ranks.append(idx + 1)
                        risk_num = risk_num + 1
                        if idx < 1:
                            risk_top1 += 1
                        elif idx < 2:
                            risk_top2 += 1
                        elif idx < 3:
                            risk_top3 += 1
                        elif idx < 4:
                            risk_top4 += 1
                for token in tokens:
                    if re.match(self.RISK_WORD, token):
                        risk_word += 1
                    elif re.match(self.UNCERTAIN_WORD, token):
                        uncertain_word += 1
            tickers.append(ticker)
            filing_dates.append(filing_date)
            total_num_sent.append(total_sent)
            total_risk_sent.append(risk_num)
            total_risk_top1.append(risk_top1)
            total_risk_top2.append(risk_top2)
            total_risk_top3.append(risk_top3)
            total_risk_top4.append(risk_top4)
            total_risk_words.append(risk_word)
            total_uncertain_words.append(uncertain_word)
            if len(scores) > 0:
                weighted_risk_score.append(np.mean(scores))
                weighted_rank_score.append(np.mean(ranks))
            else:
                weighted_risk_score.append(0)
                weighted_rank_score.append(0)
            print_progress(count, total)
            count += 1
        print('')

        df = pd.DataFrame({
            'ticker': tickers,
            'filing dates': filing_dates,
            'total number of sentences': total_num_sent,
            'total number of risk sentences': total_risk_sent,
            'score rank 1': total_risk_top1,
            'score rank 2': total_risk_top2,
            'score rank 3': total_risk_top3,
            'score rank 4': total_risk_top4,
            'average of risk score': weighted_risk_score,
            'average of ranks': weighted_rank_score,
            'total number of risk word': total_risk_words,
            'total number of uncertain word': total_uncertain_words
        })

        file_name = "risk_score.csv"
        file_path = get_score_file_path(file_name)
        df.to_csv(file_path)
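In calc_risk, model[bow_vector] is the standard gensim way to get a document's topic distribution: it returns (topic_id, probability) pairs, which the code sorts by probability to find the rank of the risk topic. A self-contained toy sketch of that lookup (the corpus, topic count, and risk topic id here are placeholders, not the project's trained model):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["interest", "rate", "risk"],
         ["revenue", "grew", "quarter"],
         ["rate", "risk", "hedge"]]
dictionary = Dictionary(texts)
model = LdaModel(corpus=[dictionary.doc2bow(t) for t in texts],
                 id2word=dictionary, num_topics=2, passes=5)

RISK_TOPIC_ID = 0  # calc_risk hard-codes 15 for its trained model; 0 only fits this toy example

bow_vector = dictionary.doc2bow(["interest", "rate", "risk"])
# Topic distribution sorted by probability, most likely topic first
for rank, (topic_id, prob) in enumerate(
        sorted(model[bow_vector], key=lambda tup: -tup[1]), start=1):
    if topic_id == RISK_TOPIC_ID:
        print(f"risk topic ranked {rank} with probability {prob:.3f}")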
Example #9
def get_input_stream():
    """Yields file paths, one at a time, as the input stream for the pipeline.
    """
    file_paths = sorted(get_file_list())
    for file_path in file_paths:
        yield file_path