Example #1
    def get_doc_metadata(self, doc):
        phrase_counter = Counter()
        try:
            for phrase in doc._.phrases:
                if self._include_chunks:
                    for chunk in phrase.chunks:
                        phrase_counter[str(chunk)] += (
                            phrase.rank + self._rank_smoothing_constant)
                else:
                    phrase_counter[phrase.text] += phrase.count * (
                        phrase.rank + self._rank_smoothing_constant)
        except Exception:  # Support for pytextrank<3
            import pytextrank
            tr = pytextrank.TextRank()
            tr.doc = doc
            phrases = tr.calc_textrank()
            for phrase in phrases:
                if self._include_chunks:
                    for chunk in phrase.chunks:
                        phrase_counter[str(chunk)] += (
                            phrase.rank + self._rank_smoothing_constant)
                else:
                    phrase_counter[phrase.text] += phrase.count * (
                        phrase.rank + self._rank_smoothing_constant)

        return phrase_counter
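Since the fallback above exists only to support pytextrank<3, here is a minimal sketch of the equivalent setup under pytextrank>=3 with spaCy 3.x (assuming the small English model is installed):

import spacy
import pytextrank  # importing registers the "textrank" factory with spaCy

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank", last=True)  # spaCy 3.x string-based add_pipe

doc = nlp("PyTextRank ranks noun phrases with a graph-based algorithm.")
for phrase in doc._.phrases:
    print("{:.4f} {:5d}  {}".format(phrase.rank, phrase.count, phrase.text))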
Example #2
def main(args):
    """
    For each publication, search for the abstract and extract key phrases 
    if abstract exists and is not null. Report if the abstract is missing.
    """
    graph = rc_graph.RCGraph("keyphr")
    graph.load_stopwords()

    # add PyTextRank into the spaCy pipeline
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank(logger=None)
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    for partition, pub_iter in graph.iter_publications(graph.BUCKET_STAGE,
                                                       filter=args.partition):
        pub_list = []

        for pub in tqdm(pub_iter, ascii=True, desc=partition[:30]):
            extract_phrases(graph, nlp, partition, pub, pub_list)

        graph.write_partition(graph.BUCKET_STAGE, partition, pub_list)

    # report errors
    status = "{} publications parsed keyphrases from abstracts".format(
        graph.publications.key_hits)
    trouble = "publications which could not parse keyphrases"
    graph.report_misses(status, trouble)
Example #3
def InitNLPPyTextRank():
    nlpPyRank = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank()
    # add PyTextRank to the spaCy pipeline
    nlpPyRank.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    return nlpPyRank
Example #4
 def __init__(self, sl_flag=1, should_remove_stop_words=True):
     """
     Initialize handler.
     :param sl_flag: stem/lemmatize flag - 0: stem 1: lemmatize
     :param should_remove_stop_words: default: True
     """
     nltk.download('stopwords')
     nltk.download('punkt')
     nltk.download('wordnet')
     self.sl_flag = sl_flag
     if sl_flag not in [0, 1]:
         raise ValueError("Invalid sl flag provided")
     self.should_remove_stop_words = should_remove_stop_words
     self.porter_stemmer = PorterStemmer()
     self.lemmatizer = WordNetLemmatizer()
     self.stop_words = set(stopwords.words('english'))
     self.punctuation_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
     self.tr = pytextrank.TextRank()
     # FIXME: sanga.s add abstractive techniques.
     # PyTextRank computes a similarity graph and returns the most
     # important sentence as the one most similar to all the others.
     self.spacy_pipeline = spacy.load("en_core_web_sm")
     self.spacy_pipeline.add_pipe(self.tr.PipelineComponent, name="TextRank", last=True)
     # FIXME
     self.word2vec_trained = True
     if not self.word2vec_trained:
         self.train_word2vec()
     self.word2vec_model = self.retrieve_model()
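The FIXME above notes that PyTextRank builds a sentence-similarity graph and picks the most central sentence; a hedged sketch of a hypothetical summarize method for this handler (not part of the original class) that would use that behavior:

 def summarize(self, text, max_sentences=3):
     """Hypothetical helper: extractive summary via the TextRank pipe."""
     doc = self.spacy_pipeline(text)
     # keep the sentences PyTextRank ranks as most central
     return [str(sent) for sent in
             doc._.textrank.summary(limit_phrases=15,
                                    limit_sentences=max_sentences)]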
Example #5
def text_summary(text, model_en, model_fr):
    # Initialize dictionary to contain all the topics of the text considered
    summ = {}
    # Initialize TextRank (Graph based) algorithm for text semantic identification
    tr = pytextrank.TextRank()
    # Load the French and English pre-trained spaCy models
    nlp_en = model_en
    nlp_fr = model_fr
    try:
        # Add TextRank to the English pipeline if it is not already present
        nlp_en.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    except ValueError:
        pass
    try:
        # Add TextRank to the French pipeline if it is not already present
        nlp_fr.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    except ValueError:
        pass

    # Detect language to know which pipelining to choose
    if detect(text) == 'en':
        doc = nlp_en(text)
    # If the detected language is not English, assume French (the only other language handled here)
    else:
        doc = nlp_fr(text)

    # Retrieve the top 20 phrases considered most representative of the text
    tags = doc._.phrases[0:20]
    # Save in initialized dictionary
    summ["tags"] = tags

    return summ
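A hedged usage sketch of text_summary above; the model names are assumptions, and detect() (presumably from langdetect) chooses which pipeline handles the input:

import spacy

model_en = spacy.load("en_core_web_sm")
model_fr = spacy.load("fr_core_news_sm")

result = text_summary("La traduction automatique progresse rapidement.", model_en, model_fr)
print(result["tags"])  # the top-ranked phrases chosen by TextRank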
Example #6
 def setUp(self):
     """set up a spaCy pipeline"""
     self.nlp = spacy.load("en_core_web_sm")
     self.tr = pytextrank.TextRank(logger=None)
     self.nlp.add_pipe(self.tr.PipelineComponent,
                       name="textrank",
                       last=True)
Example #7
def breakIntoWordsAndPhrases(text):
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(text)
    # collect every noun chunk that belongs to a ranked phrase
    hashset = set()
    for p in doc._.phrases:
        for q in p.chunks:
            hashset.add(str(q))

    # map each chunk's first occurrence in the text to its start index
    indextophrases = {}
    for s in hashset:
        indextophrases[text.find(s)] = s

    # walk the text, alternating plain-text runs with ranked phrase chunks
    i = 0
    end = len(text)
    chunks = []
    string = ""
    while i < end:
        if i in indextophrases:
            chunks.append(string)
            chunks.append(indextophrases[i])
            i += len(indextophrases[i])
            string = ""
        else:
            string += text[i]
            i += 1
            if i == end: chunks.append(string)

    return chunks
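A hedged usage sketch; the exact split depends on which phrases TextRank ranks for the input:

chunks = breakIntoWordsAndPhrases("Deep learning improves natural language processing.")
# the returned list alternates plain-text runs with ranked phrase chunks
print(chunks)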
Example #8
    def key_phrases(self):
        # example text
        text = self.text

        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")

        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

        doc = nlp(text)

        # to examine the top-ranked phrases in the document:
        #for p in doc._.phrases:
        #print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
        #print(p.chunks)

        sentences = []
        quiz_length = random.randint(4, 7)

        for sentence in doc._.textrank.summary(limit_phrases=1,
                                               limit_sentences=quiz_length):
            sentences.append(str(sentence).replace("\n", ""))

        return sentences
Example #9
def get_TextRank(article, n):
    nlp = spacy.load("en_core_web_sm")

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    doc = nlp(article)

    # examine the top-ranked phrases in the document
    topKeywords = []

    for i in range(n):
        # print(type(doc._.phrases[i].text))
        # print(type(str(doc._.phrases[i].text)))
        topKeywords.append(doc._.phrases[i].text)
    '''
    for p in doc._.phrases:
        #print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
        print(type(p.text))
    print('***********')
    '''
    # print(topKeywords)
    # print(type(topKeywords))
    # print(type(topKeywords[0]))
    return topKeywords
Example #10
 def __init__(self, model="distilbert-base-nli-stsb-mean-tokens"):
     self.model = SentenceTransformer(model)
     self.tokenizer = spacy.load("en_core_web_lg")
     tr = pytextrank.TextRank()
     self.tokenizer.add_pipe(tr.PipelineComponent,
                             name="textrank",
                             last=True)
Example #11
def extract_keyterms():
    nlp = en_core_web_sm.load()
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)

    texts = glob.glob("./text-extraction/*.txt")
    for text in texts:
        final_result_name = './results/' + get_file_name(text).replace(
            'txt', 'csv')
        print('Extracting key terms from text: ' + get_file_name(text) + '.')
        print('This process may take time, please wait ...')
        # Reading the text file
        arquivo = open(text, 'r', encoding='utf8')
        text = arquivo.read()
        doc = nlp(text)
        # examine the top-ranked phrases in the document
        dictFinal = defaultdict(list)
        for p in doc._.phrases:
            # print('{:.4f} {:5d}  {}'.format(p.rank, p.count, p.text))
            if (len(p.text) > 3):
                dictFinal['phrases'].append(p.text)
                dictFinal['count'].append(p.count)
                dictFinal['rank'].append(p.rank)

        print('process finished, the result is in the ' + final_result_name +
              ' file')
        df = pd.DataFrame(dictFinal)
        df.sort_values(by=['rank', 'count'], ascending=False, inplace=True)
        df.to_csv(final_result_name)
Example #12
def pytextrank_output(text, ratio_val):
    #Insert path to "en_core_web_sm-2.2.5" in the local file
    nlp = spacy.load("insert path here")
    # add PyTextRank into the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    bad_chars = [';', ':', '!', "*", '•', '’', '\ufeff']
    for i in bad_chars:
        text = text.replace(i, '')
    doc = nlp(text)
    split = int(len(text.split('.')) * ratio_val)
    list1 = ''
    for sent in doc._.textrank.summary(limit_phrases=split,
                                       limit_sentences=split):
        list1 = list1 + str(sent)
    return list1
Example #13
 def __init__(self, argv):
     super().__init__(command=__file__, argv=argv)
     spacy.prefer_gpu()
     self.nlp = spacy.load('en_core_web_sm')
     #coref = neuralcoref.NeuralCoref(self.nlp.vocab)
     #self.nlp.add_pipe(coref, name='neuralcoref');
     tr = pytextrank.TextRank()
     self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
     self.__text_processor = TextProcessor(self.nlp, self._driver)
     self.create_constraints()
Example #14
def run_textrank_model(
    entry_id, phrase_limit
):  # extract the paragraph and header text for the given entry and derive topics from them
    mycol = refer_collection()
    comp_data_entry = mycol.find({"_id": entry_id})
    data = [i for i in comp_data_entry]
    print("textrank model started", str(data[0]['_id']), data[0]['link'])
    try:
        h_p_data = data[0]["paragraph_text"] + data[0][
            "header_text"]  # do topic extraction on paragraph and header text

        combined_text = " ".join(h_p_data)

        # load a spaCy model, depending on language, scale, etc.
        nlp = spacy.load("en_core_web_sm")

        # add PyTextRank to the spaCy pipeline
        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
        nlp.max_length = 150000000
        doc = nlp(combined_text)

        # examine the top-ranked phrases in the document
        tr_results = []
        tr_words = []
        for p in doc._.phrases[:phrase_limit]:
            tr_results.append([p.rank, p.count, p.text])
            tr_words.append(p.text)
            # print(p.chunks)
        # summery_res = []
        # for sent in doc._.textrank.summary(limit_sentences=summery_limit):
        #     print(sent)
        #     summery_res.append(str(sent))
        # print(summery_res)
        if (len(tr_words)):
            print(tr_words)
            mycol.update_one({'_id': entry_id},
                             {'$set': {
                                 'textrank_results': tr_words
                             }})
            print("Successfully extended the data entry with textrank results",
                  entry_id)
        else:
            mycol.update_one({'_id': entry_id},
                             {'$set': {
                                 'textrank_results': []
                             }})
            print("vocabulary is empty")
    except Exception:
        mycol.update_one({'_id': entry_id}, {'$set': {'textrank_results': []}})
        print("vocabulary is empty")


# run_textrank_model("F://Armitage_project//crawl_n_depth//extracted_json_files//0_www.sureway.com.au_data.json",50,5)
Example #15
def test_extraction_with_TEXTRANK():
    tr = pytextrank.TextRank()
    pos_el = spacy.load("el_core_news_md")

    pos_el.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    while True:
        input_doc = input()
        if input_doc == "end":
            break
        output = extract_keywords_TEXTRANK(pos_el, input_doc, 5)
        print(output)
Example #16
def phrase_rank(text, count):
    nlp = spacy.load("en_core_web_sm")

    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    doc = nlp(text)
    res = []
    for phrase in doc._.phrases[:count]:
        res.append(str(phrase).capitalize())

    return res
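A hedged usage sketch of phrase_rank above:

top_phrases = phrase_rank("Graph algorithms rank the key phrases in a document.", 3)
print(top_phrases)  # up to 3 capitalized phrase strings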
Example #17
File: keywords.py Project: librairy/nlp
 def get_keywords(self, text):
     lang = self.get_language(text)
     model = lang + "_core_news_sm"
     print("loading model: '" + model + "'")
     nlp = spacy.load(model)
     tr = pytextrank.TextRank()
     nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
     doc = nlp(text)
     key_words = []
     for p in doc._.phrases:
         print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
         key_words.append(p.text)
     return key_words[:10]
Example #18
def yelpTrendyPhrases(business_id='Iq7NqQD-sESu3vr9iEGuTA',
                      periods=12,
                      bagging_periods=3,
                      days_per_period=30,
                      topk=10):
    '''
    1. Get Yelp review texts
    2. Bag review texts within a certain period, e.g. 6 periods (180 days)
    3. Use Textrank to get scores
    4. Return JSON format for the frontend visualization
    '''
    # In Google Colab, run 6 period bagging:
    # CPU times: user 24.5 s, sys: 520 ms, total: 25 s
    # Wall time: 25 s
    # https://colab.research.google.com/drive/1r4uvFA6RNV35lO3JcYoO5Psz_EVhmNu0

    df_reviews = pd.DataFrame(columns=['date', 'text'])
    current_date = datetime.strptime('2018-11-30', '%Y-%m-%d')
    past_date = current_date - timedelta(days=days_per_period * periods - 1)
    getYelpReviews(business_id,
                   starting_date=current_date,
                   ending_date=past_date)

    # load a spaCy model, depending on language, scale, etc.
    nlp = spacy.load("en_core_web_sm/en_core_web_sm-2.2.5")
    # customize the lemmatizer
    # https://spacy.io/api/lemmatizer
    # ...
    textrank = pytextrank.TextRank()
    nlp.add_pipe(textrank.PipelineComponent, name="textrank", last=True)

    keywords = []
    for period in range(periods):
        # [starting_date, ending_date] = 180 days
        # or ending_date - starting_date = 179 days
        ending_date = current_date - timedelta(days=days_per_period * period)
        starting_date = ending_date - timedelta(
            days=days_per_period * bagging_periods - 1)

        condition = ((df_reviews['date'] >= starting_date) &
                     (df_reviews['date'] <= ending_date))
        df_texts = df_reviews[condition][['text', 'date']]
        text = " ".join(df_texts['text'].to_list())
        doc = nlp(text)
        for i, p in enumerate(doc._.phrases):
            keywords.append([ending_date, p.rank, p.count, p.text])
            if i >= topk - 1: break
    del [df_reviews]
    df_keywords = pd.DataFrame(keywords,
                               columns=['date', 'rank', 'count', 'keywords'])
    df_keywords = df_keywords['keywords'].value_counts().index[:topk]
Example #19
def word_bag_list(org_text):
    """Take text, summarize it, and return the summarized sentences as a list."""
    # load language model
    nlp = spacy.load("en_core_web_sm")
    # init pytextrank, then add pipe
    tr = pytextrank.TextRank(logger=None)
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    # process the text
    doc = nlp(org_text)
    # merge results into one string
    whole_sent = ""
    for sent in doc._.textrank.summary(limit_phrases=15, limit_sentences=5):
        whole_sent = whole_sent + repr(sent).rstrip() + " "
    return [whole_sent]
Example #20
def textrank(corpus):
    import spacy
    import pytextrank
    # the original snippet relied on an `nlp` defined elsewhere; load a model
    # here so the function is self-contained (model choice is an assumption)
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(corpus)
    print("=" * 50)
    for phrase in doc._.phrases:
        print(phrase)
        print("{:.4f} {:5d}  {}".format(phrase.rank, phrase.count,
                                        phrase.text))
        print(phrase.chunks)
    print("=" * 50)
    for sent in doc._.textrank.summary(limit_phrases=15, limit_sentences=5):
        print(sent)
Example #21
    def __init__(self, name):
        super(App, self).__init__(name)

        # Load models
        print("[INFO] Loading spacy model")
        self.nlp = spacy.load('en_core_web_md')

        # Adding pipe
        print("[INFO] Adding pipe")
        tr = pytextrank.TextRank()
        self.nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

        # Load classifier
        self.classifier = onnxruntime.InferenceSession("./models/classifier.onnx")
Example #22
    def keywords(self, text, ratio=0.2):
        nlp = spacy.load(self.model_name)

        tr = pytextrank.TextRank()
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

        doc = nlp(text)

        if doc._.phrases:
            phrase_count = len(doc._.phrases)
            lower_limit_num = max(int(ratio * phrase_count), 1)
            return dict([(p.text, p.rank) for p in doc._.phrases[:lower_limit_num]])
        else:
            return dict()
Example #23
def get_key_phrases(textstr):
    nlp = spacy.load("en_core_web_sm")
    doc_list = []

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    doc = nlp(textstr)

    # examine the top-ranked phrases in the document
    for p in doc._.phrases:
        #print("{}".format(p.text))
        doc_list.append(p)
    return doc_list
Example #24
def get_keywords(text):
    # load a spaCy model, depending on language, scale, etc.
    lang = get_language(text)
    os.system('python3 -m spacy download ' + lang)
    nlp = spacy.load(lang)
    nlp.max_length = 29204346
    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    doc = nlp(text)
    # examine the top-ranked phrases in the document
    key_words = []
    for p in doc._.phrases:
        #print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
        key_words.append(p.text)
    return key_words
Example #25
def setup(base_path=".", testing=False):
    """
    add PyTextRank into the spaCy pipeline, then set up the input
    directory path for test vs. production env
    """
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank(logger=None)

    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    if testing:
        resource_path = Path(base_path) / "example/pub"
    else:
        resource_path = Path(base_path) / "resources/pub"

    return nlp, resource_path
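A hedged usage sketch of setup above (the sample sentence is arbitrary):

nlp, resource_path = setup(testing=True)
doc = nlp("A quick smoke test of the TextRank pipeline.")
print(resource_path, len(doc._.phrases))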
Example #26
    def generate_base_map(self, text):
        tr = pytextrank.TextRank()
        self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
        doc = self.nlp(text.lower())
        aux_key = self.get_key_phrases(doc, 10)
        key_phrases = self.process_key_phrases(aux_key)
        key = [str(word) for word in key_phrases]

        clus = self.cluster_texts(key, int(len(key[0]) / 2))
        key = array(key)

        base_map = dict()
        for label, indexs in clus.items():
            if len(key[indexs].tolist()) > 1:
                base_map[key[indexs][0]] = key[indexs].tolist()[1:]

        return base_map
Example #27
def run_textrank_model(
    posts, phrase_limit, summery_limit
):  # extract topics from the combined paragraph and header text in the given posts

    # data_words = list(sent_to_words(posts))
    # data_words_nostops = remove_stopwords(data_words)
    # data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ'])
    # print(data_lemmatized)
    # all_tokens = [j for i in data_lemmatized for j in i]
    # combined_text = " ".join(all_tokens)
    combined_text = " ".join(posts)
    # combined_text = h_p_data
    # print(combined_text)
    print("running textrank model")
    # load a spaCy model, depending on language, scale, etc.
    nlp = spacy.load("en_core_web_sm")

    # add PyTextRank to the spaCy pipeline
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
    nlp.max_length = 150000000
    doc = nlp(combined_text)

    # examine the top-ranked phrases in the document
    tr_results = []
    tr_words = []
    for p in doc._.phrases[:phrase_limit]:
        print("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
        tr_results.append([p.rank, p.count, p.text])
        tr_words.append(p.text)
        # print(p.chunks)
    # summery_res = []
    # for sent in doc._.textrank.summary(limit_sentences=summery_limit):
    #     print(sent)
    #     summery_res.append(str(sent))
    # print(tr_results)
    # print(summery_res)
    return tr_words
    # print(summery_res)
    # data[0]['textrank_resutls'] = tr_results  # dump the extracted topics back to the json file
    # data[0]['textrank_summery__resutls'] = summery_res
    # with open(path_to_json, 'w') as outfile:
    #     json.dump(data, outfile)


# run_textrank_model("F://Armitage_project//crawl_n_depth//extracted_json_files//0_www.sureway.com.au_data.json",50,5)
Example #28
    def top_keywords(self, n=50):
        nlp = spacy.load("en_core_web_sm")
        nlp.max_length = 4000000
        pos = ['ADJ', 'NOUN', 'PROPN']
        tr = pytextrank.TextRank(pos_kept=pos)
        nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)
        doc = nlp(self._text)

        keywords = []
        count = 0
        for p in doc._.phrases:
            if count == n:
                break
            logging.debug("{:.4f} {:5d}  {}".format(p.rank, p.count, p.text))
            keywords.append(p.text)
            count += 1
        return keywords
Example #29
    def load_nlp_pipe(self, pipes):
        """
        This function creates and loads all the pipes into the nlp-er

        :pipes: list of pipe names as strings
        """

        for pipe in pipes:
            if pipe == 'sentencizer':  #needs to go before the parser.
                nlp_pipe = self.nlp.create_pipe(pipe)
                self.nlp.add_pipe(nlp_pipe, before='parser')
            elif pipe == 'textrank':
                tr = pytextrank.TextRank()
                self.nlp.add_pipe(tr.PipelineComponent, name=pipe, last=True)
            else:
                nlp_pipe = self.nlp.create_pipe(pipe)
                self.nlp.add_pipe(nlp_pipe)
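A hedged usage sketch; handler is a hypothetical instance of the class defining load_nlp_pipe, with handler.nlp already set to a loaded spaCy model:

handler.load_nlp_pipe(['sentencizer', 'textrank'])
doc = handler.nlp("First sentence. Second sentence.")
print(len(list(doc.sents)), len(doc._.phrases))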
Example #30
def main():
    nlp = spacy.load("en_core_web_sm")
    tr = pytextrank.TextRank(logger=None)
    nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

    dir_path = Path("resources/pub")
    txt_path = dir_path / "txt"

    task_ids = []
    ray.init()

    for txt_file in tqdm(list(txt_path.glob(f"*txt")),
                         ascii=True,
                         desc=f"extracted text files"):
        id = extract_phrases.remote(txt_file, dir_path, nlp)
        task_ids.append(id)

    ray.get(task_ids)