Example #1
def generateGraph(text, outputfile, outputdir, plotGraph=False):
    print('Generating Graph...')
    # Start by doing statistical parsing/tagging on the document
    temp_file = os.path.join(outputdir, 'temp.json')
    path_stage1 = os.path.join(outputdir,
                               outputfile.split("_")[0] + '_o1.json')
    txtToJson.textTojson(text, temp_file)
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(temp_file)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Collect and normalize the key phrases from the parsed doc
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    path_stage2 = os.path.join(outputdir, outputfile)
    try:
        # clear any previous output file at the full output path
        os.remove(path_stage2)
    except OSError:
        pass
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
    try:
        os.remove(temp_file)
    except OSError:
        pass

    if plotGraph:
        matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
        nx.draw(graph, with_labels=True)
        plt.show()
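`txtToJson.textTojson` is a helper from that project, not part of pytextrank; presumably it writes the one-record-per-line {"id", "text"} JSON that `pytextrank.json_iter` consumes, along the lines of this sketch:

import json

def text_to_json(text, out_path, doc_id="777"):
    # hypothetical stand-in: json_iter reads one JSON object per line
    with open(out_path, 'w') as f:
        json.dump({"id": doc_id, "text": text}, f)
        f.write("\n")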
Example #2
import ast
import json
import os

from pytextrank import (json_iter, normalize_key_phrases, parse_doc,
                        pretty_print, render_ranks, text_rank)


def one(text):

    path_stage0 = "tempfile.json"
    path_stage1 = "o1.json"
    path_stage2 = "o2.json"

    # json.dump escapes quotes and newlines in the text safely
    with open(path_stage0, "w") as f:
        json.dump({"id": "777", "text": text}, f)

    with open(path_stage1, 'w') as f:
        for graf in parse_doc(json_iter(path_stage0)):
            f.write("%s\n" % pretty_print(graf._asdict()))

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    outputs = []
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            ans = "%s\n" % pretty_print(rl._asdict())
            f.write(ans)  # also persist the normalized phrases to stage 2
            output = ast.literal_eval(ans)
            outputs.append((output["text"], output["rank"]))

    os.remove(path_stage0)

    return outputs


# text = "The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."
# print (one("The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."))
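Example #3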
def execute_stage_one(path_stage0):
    path_stage1 = path_stage0.split('::')[0] + ".stage1.output.dat"
    with open(os.path.join(PATH_PREFIX, path_stage1), 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(
                                         os.path.join(PATH_PREFIX, path_stage0))):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
    return path_stage1
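Here `PATH_PREFIX` is assumed to be a module-level constant naming the directory that holds the stage files.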
Example #4
def text_rank(json_request):
    # Extract titles (TI) and abstracts (AB) from the RIS payload
    pattern = re.compile(r"TI  - (.*?)\r|AB  - (.*?)\r")
    matches = re.findall(pattern, json_request['ris'])
    all_inputs = []
    for section in matches:
        all_inputs.append(' '.join(section).strip())

    input_json = {}
    input_json['id'] = "0"
    input_json['text'] = '.'.join(all_inputs)

    # stage output paths; assumed names, since the snippet never defines them
    path_stage1 = 'o1.json'
    path_stage2 = 'o2.json'

    with open('ris_extracted.json', 'w') as output:
        json.dump(input_json, output)

    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter('ris_extracted.json')):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    phrases = list(pytextrank.limit_keyphrases(path_stage2, phrase_limit=20))

    return phrases
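For reference, the regex targets RIS-formatted records, in which titles and abstracts appear on `TI  - ...` and `AB  - ...` lines terminated by carriage returns; a quick check of what it captures:

import re

pattern = re.compile(r"TI  - (.*?)\r|AB  - (.*?)\r")
sample_ris = "TI  - A sample title\rAB  - A sample abstract\r"
print(re.findall(pattern, sample_ris))
# [('A sample title', ''), ('', 'A sample abstract')]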
Example #5
def obj_to_keywords(obj):
    if not isinstance(obj, list):
        obj = [obj]
    graphs = list(pytextrank.parse_doc(obj))
    dicts = [g._asdict() for g in graphs]
    # text_rank accepts the in-memory graf dicts as well as a stage-1 path
    # (cf. Example #29), so no intermediate file is needed here
    graph, ranks = pytextrank.text_rank(dicts)
    keywords = [rl._asdict() for rl in pytextrank.normalize_key_phrases(dicts, ranks)]
    return keywords
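A hypothetical call, passing a single stage-0 record:

kws = obj_to_keywords({"id": "1",
                       "text": "Compatibility of systems of linear constraints over the set of natural numbers."})
for kw in kws:
    print(kw["text"], kw["rank"])

Example #6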
    def summarize(self, _id, content_text, word_limit):
        self.logger.log("_id: " + _id)
        self.logger.log("word_limit: " + str(word_limit))

        # File names
        path_stage0 = 'process/' + _id + '.json'
        path_stage1 = 'process/' + _id + '_o1.json'
        path_stage2 = 'process/' + _id + '_o2.json'
        path_stage3 = 'process/' + _id + '_o3.json'
        path_stage4 = 'process/' + _id + '_o4.json'

        # Create input file
        with open(path_stage0, 'w') as outfile:
            json.dump({"id": "123", "text": content_text}, outfile)

        # Statistical Parsing - Stage 1
        # Perform statistical parsing/tagging on a document in JSON format
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(
                    pytextrank.json_iter(path_stage0)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Ranked Keyphrases - Stage 2
        # Collect and normalize the key phrases from a parsed document
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        # Extractive Summarization -  Stage 3
        # Calculate a significance weight for each sentence, using MinHash to approximate a Jaccard distance from key phrases determined by TextRank
        kernel = pytextrank.rank_kernel(path_stage2)

        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        # Final Output - Stage 4
        # Summarize a document based on most significant sentences and key phrases
        phrases = ", ".join(
            set([
                p for p in pytextrank.limit_keyphrases(path_stage2,
                                                       phrase_limit=12)
            ]))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                      word_limit=word_limit),
                           key=lambda x: x[1])
        s = []

        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))

        graf_text = " ".join(s)

        return {'excerpts': graf_text, 'keywords': phrases}
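A hypothetical invocation, assuming an instance (here called summarizer) wired up with a logger and an existing process/ directory:

result = summarizer.summarize("doc42", "Some long document text...", word_limit=80)
print(result['excerpts'])
print(result['keywords'])

Example #7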
def pred_net(sample_case):

    import numpy as np, keras
    from PIL import Image
    import json, pytextrank, networkx as nx
    import matplotlib.pyplot as plt

    path_stage0 = "o0.json"
    path_stage1 = "o1.json"

    # write the stage-0 record directly (no json round trip needed)
    file_dic = {"id": 0, "text": sample_case}
    with open(path_stage0, 'w') as outfile:
        json.dump(file_dic, outfile)

    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            print(pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    nx.draw(graph, with_labels=True)
    plt.savefig("sample_case.png", dpi=200, format='png', bbox_inches='tight')
    plt.close()

    im = Image.open("sample_case.png").convert('L').resize((300, 200))
    sample_image = np.array([np.array(im)])
    sample_image = sample_image.reshape(sample_image.shape[0],
                                        sample_image.shape[1],
                                        sample_image.shape[2], 1)

    model = keras.models.load_model("graph_conv_autoencoder.hdf5")

    y_pred = model.predict(sample_image)
    labels = [
        'Major Depressive Disorder',
        'Attention Deficit Hyperactivity Disorder',
        'Oppositional Defiant Disorder', 'Conduct Disorder',
        'Pervasive Developmental Disorder',
        'Intellectual Disability (Mental Retardation)', 'Psychotic Disorder',
        'Adjustment Disorder', 'Mood Disorder', 'General Anxiety Disorder',
        'Social Anxiety Disorder', 'Seasonal Affective Disorder',
        'Substance Abuse', 'Autism Spectrum Disorder'
    ]

    max1 = labels[np.argmax(y_pred)]

    about1 = treatment1 = None  # fallbacks in case no diagnosis entry matches
    with open('external_resources.json') as data_file:
        for v in json.load(data_file):
            if v['diagnosis'] == max1:
                about1, treatment1 = v['about'], v['treatment']

    return (max1, about1, treatment1)
Example #8
    def generate_graph(self):
        """Generates a text graph for each sentence"""
        graphs = []
        for graf in pytextrank.parse_doc([{
                "id": generate_id(),
                "text": text
        } for text in self.texts]):
            graphs.append(graf._asdict())
        return graphs
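The snippet assumes a project-local `generate_id` helper; a minimal stand-in (any unique string id works for `parse_doc`) might be:

import uuid

def generate_id():
    # hypothetical stand-in for the project's id helper
    return str(uuid.uuid4())

Example #9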
def pytrankSummarize(filename):
    """
    This is another TextRank algorithm. It works in four stages, each feeding its output to the next
    1. Part-of-Speech Tagging and lemmatization are performed for every sentence in the document.
    2. Key phrases are extracted along with their counts, and are normalized.
    3. Calculates a score for each sentence by approximating jaccard distance between the sentence and key phrases.
    4. Summarizes the document based on most significant sentences and key phrases.
    """

    import pytextrank

    jsonText = createJSON(filename)

    path_stage0 = jsonText
    path_stage1 = "o1.json"

    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    path_stage2 = "o2.json"

    graph, ranks = pytextrank.text_rank(path_stage1)

    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    path_stage3 = "o3.json"

    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    phrases = ", ".join(
        set([
            p
            for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)
        ]))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=50),
                       key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)

    print("")
    print("####### From PyTextRank #######")
    print("**excerpts:** %s\n\n**keywords:** %s" % (
        graf_text,
        phrases,
    ))
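`createJSON` is a helper local to that project and is not shown; presumably it wraps the file's raw text in the stage-0 JSON format and returns the resulting path, along these lines:

import json

def createJSON(filename):
    # hypothetical reconstruction: wrap raw text as a stage-0 {"id", "text"} record
    with open(filename) as f:
        text = f.read()
    path_stage0 = "o0.json"
    with open(path_stage0, 'w') as out:
        json.dump({"id": "0", "text": text}, out)
    return path_stage0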
Example #10
    def perform_statistical_parsing_tagging(self, text_file, paragraph_output):
        """
            Perform statistical parsing and tagging of
            sentences in the text (supplied as a JSON document)
        """

        with open(paragraph_output, 'w') as temp_file:
            for paragraph in pytextrank.parse_doc(pytextrank.json_iter(text_file)):
                temp_file.write("%s\n" % pytextrank.pretty_print(paragraph._asdict()))
Example #11
def summarize_text(input_file):
    # seriously f**k this API
    path_stage0 = input_file
    path_stage1 = 'stage1.txt'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(graf))

    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    path_stage2 = 'stage2.txt'
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
            # to view output in this notebook
            #print(pytextrank.pretty_print(rl))

    path_stage3 = 'stage3.txt'
    kernel = pytextrank.rank_kernel(path_stage2)

    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
            # to view output in this notebook
            #print(pytextrank.pretty_print(s._asdict()))

    phrases = ", ".join(
        set([
            p
            for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)
        ]))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=120),
                       key=lambda x: x[1])
    s = []

    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))

    graf_text = " ".join(s)
    #print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases,))

    return ' '.join(s)
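Example #12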
    def perform_statistical_parsing_tagging(self, text_file, paragraph_output):
        """
            Perform statistical parsing and tagging of
            sentences in the text (supplied as a JSON document)

            Parameters
            ==========
            text_file:
               file containing the input text (as JSON) to perform
               statistical parsing and tagging on
            paragraph_output:
               output file into which the results are written (as a JSON file)

            Return
            ======
            Nothing, writes results to a text file (as JSON)
        """
        with open(paragraph_output, 'w') as temp_file:
            for paragraph in pytextrank.parse_doc(
                    pytextrank.json_iter(text_file)):
                temp_file.write("%s\n" %
                                pytextrank.pretty_print(paragraph._asdict()))
Example #13
def text_ranking(video_seg_id, book_segment):
    """
    :param book_segment: book segment in json format
    :return: key sentences and key phrases
    """
    # os.chdir(video_path)
    # creating directory to store segments for clean structure
    if not os.path.exists('TextRank_data'):
        os.mkdir('TextRank_data')
    if not os.path.exists('TextRank_data/seg' + str(video_seg_id)):
        os.mkdir('TextRank_data/seg' + str(video_seg_id))
    subdir = 'TextRank_data/seg' + str(video_seg_id) + '/'
    path_stage1 = subdir + "stage1.json"
    path_stage2 = subdir + "stage2_key_ph.json"
    path_stage3 = subdir + "stage3_imp_sent.json"

    """Perform statistical parsing/tagging on a document in JSON format"""
    parse_book_seg = pytextrank.parse_doc([book_segment])
    with open(path_stage1, 'w') as f:
        for graf in parse_book_seg:
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)
    """Collect and normalize the key phrases from a parsed document"""

    key_phrases = list(pytextrank.normalize_key_phrases(path_stage1, ranks))
    with open(path_stage2, 'w') as f:
        for rl in key_phrases:
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    kernel = pytextrank.rank_kernel(path_stage2)
    """Calculate a significance weight for each sentence, 
    using MinHash to approximate a Jaccard distance from key phrases determined by TextRank"""
    key_sentences = list(pytextrank.top_sentences(kernel, path_stage1))
    with open(path_stage3, 'w') as f:
        for s in key_sentences:
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
    return key_sentences, key_phrases
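A hypothetical call, passing a segment id and a single {"id", "text"} dict:

sentences, phrases = text_ranking(0, {"id": "seg0",
                                      "text": "Criteria of compatibility of a system of linear Diophantine equations are considered."})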
Example #14
def _get_keywords(path_stage0, path_stage2):
    # Stage 1: parse doc
    path_stage1 = 'o1.json'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: rank words
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    result_dict = dict()
    with open(path_stage2, 'w') as f2:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            _ro = rl._asdict()
            ro = {_ro['text']: _ro['rank']}
            # write the phrase/rank pair to stage 2 as well as collecting it
            f2.write("%s\n" % pytextrank.pretty_print(ro))
            result_dict[_ro['text']] = _ro['rank']

    return result_dict
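A hypothetical call, assuming a stage-0 JSON-lines file on disk:

scores = _get_keywords("o0.json", "o2.json")
for phrase, rank in sorted(scores.items(), key=lambda kv: -kv[1]):
    print(phrase, rank)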
Example #15
def stage_1_multiprocess(args):
    # args arrives packed in a single tuple, presumably for use with a multiprocessing map
    publisher, version, book, reviews = args
    out_file_name = "{publisher}_{version}_{asin}.Stage1".format(
        version=version, asin=book, publisher=publisher)
    stage_1_directory = os.path.join(directory, "Stage1Results")

    save_directory = os.path.join(stage_1_directory, out_file_name)
    if os.path.isfile(save_directory):  # already there
        return None
    print(book)
    fake_json = [{
        'id': user,
        'text': review_text
    } for user, asin, title, review_text, timestamp in reviews]
    # pprint.pprint(books[book][0])
    fake_json_graph_dicts = []
    for graf in pytextrank.parse_doc(fake_json):
        graph_dict = graf._asdict()
        fake_json_graph_dicts.append([graph_dict])
    pickle.dump(fake_json_graph_dicts, open(save_directory, 'wb'))
Example #16
def retrieveSentences(content, word_limit):
    currpath = os.getcwd()
    folder = os.path.join(currpath, str(uuid.uuid4()))
    os.mkdir(folder)
    fname = str(uuid.uuid4())
    with open("{0}/{1}.json".format(folder, fname), "w") as f:
        f.write(json.dumps({"id": fname, "text": content}))
        f.close()
    path_stage0 = "{0}/{1}.json".format(folder, fname)
    path_stage1 = "{0}/o1.json".format(folder)
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
    path_stage2 = "{0}/o2.json".format(folder)
    graph, ranks = pytextrank.text_rank(path_stage1)
    #pytextrank.render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
    kernel = pytextrank.rank_kernel(path_stage2)
    path_stage3 = "{0}/o3.json".format(folder)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                  word_limit=word_limit),
                       key=lambda x: x[1])
    s = []
    for sent_text, idx in sent_iter:
        s.append(pytextrank.make_sentence(sent_text))
    graf_text = " ".join(s)
    shutil.rmtree(folder)
    return s
Example #17
def rank_bill(bill):
    bill_id = bill['bill_id']
    path_stage1 = prefix + '/{}_stage1'.format(bill_id)
    path_stage2 = prefix + '/{}_stage2'.format(bill_id)
    path_stage3 = prefix + '/{}_stage3'.format(bill_id)

    with open(path_stage1, 'w') as f:
        for graf in parse_doc([bill]):
            f.write(pretty_print(graf._asdict()))
            f.write('\n')

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    # open the stage-2 file once so every normalized phrase is kept,
    # not just the last one
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            f.write(pretty_print(rl._asdict()))
            f.write('\n')

    kernel = rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in top_sentences(kernel, path_stage1):
            f.write(pretty_print(s._asdict()))
Example #18
def do_pytextrank(data):
    for item in data:
        for subItem in data[item]:
            print('###############')
            print('description:', subItem['description'])

            # using pytextrank
            # reference https://github.com/ceteri/pytextrank/issues/18

            # raw input
            subItemJSON = {'id': subItem['id'], 'text': subItem['description']}
            subItemJSON = json.dumps(subItemJSON)
            with open('sub_item.json', 'w') as outFile:
                outFile.write(subItemJSON)

            # stage 1
            with open('stage1_output.json', 'w') as outFile:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter('sub_item.json')):
                    outFile.write("%s\n" %
                                  pytextrank.pretty_print(graf._asdict()))

            # stage 2
            graph, ranks = pytextrank.text_rank('stage1_output.json')
            pytextrank.render_ranks(graph, ranks)
            rlLists = []
            print('key phrases:')
            with open('stage2_output.json', 'w') as outFile:
                for rl in pytextrank.normalize_key_phrases(
                        'stage1_output.json', ranks):
                    # ast.literal_eval parses the pretty-printed JSON list
                    # more safely than eval (assumes ast is imported)
                    rlList = ast.literal_eval(pytextrank.pretty_print(rl))
                    rlLists.append(rlList)
                    print(rlList)

            # cleanup
            os.system(
                'rm -f sub_item.json stage1_output.json stage2_output.json graph.dot'
            )

            # input filter results based on pos
            # this is a heuristic
            filteredRlLists = [x for x in rlLists if 'nn' not in x[-2]]
            if (len(filteredRlLists) == 0):
                # invalid case
                continue
            else:
                [heuristic, iOItem] = do_heuristic(subItem, filteredRlLists)
                print('heuristic:', heuristic)
                print('i/o input:', iOItem)

            # output filter results based on pos
            # this is a heuristic
            filteredRlLists = [x for x in rlLists if 'nn' in x[-2]]
            if (len(filteredRlLists) == 0):
                # invalid case
                continue
            else:
                [heuristic, iOItem] = do_heuristic(subItem, filteredRlLists)
                print('heuristic:', heuristic)
                print('i/o output:', iOItem)

            print('###############')
Example #19
    def ParseText(self, text):  # parse text and convert to JSON
        parse = parse_doc(self.text2json(text))
        parse_list = [json.loads(pretty_print(i._asdict())) for i in parse]
        self.parse_list = parse_list

        return parse_list
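`text2json` is assumed to be a sibling method; since `parse_doc` accepts an iterable of {"id", "text"} dicts, a minimal stand-in could be:

    def text2json(self, text):
        # hypothetical helper: wrap raw text as a single stage-0 record
        return [{"id": "0", "text": text}]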
Example #20
def extract_phrasesfrom_textrank(corpus):
    record_data = pd.DataFrame({'sentences': corpus})
    record_data = pd.DataFrame({
        'id': record_data.index.tolist(),
        'text': record_data['sentences'].tolist()
    })
    tweet_items = record_data.to_dict(orient='records')

    new_df_tweet = pd.DataFrame(columns=['text', 'keywords'])
    path_stage1 = "celebrity1_tweet.json"
    path_stage2 = "celebrity2_tweet.json"
    path_stage3 = "celebrity3_tweet.json"
    for item in tweet_items:
        items_new = [item]
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(items_new):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        kernel = pytextrank.rank_kernel(path_stage2)

        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")
        phrases = ", ".join(
            set([
                p for p in pytextrank.limit_keyphrases(path_stage2,
                                                       phrase_limit=5)
            ]))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                      word_limit=150),
                           key=lambda x: x[1])
        s = []

        for sent_text, idx in sent_iter:
            s.append(pytextrank.make_sentence(sent_text))

        graf_text = " ".join(s)
        # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
        new_df_tweet = pd.concat(
            [new_df_tweet,
             pd.DataFrame([{'text': item.get('text'), 'keywords': phrases}])],
            ignore_index=True)

    celeb_list = [
        'Bradley Cooper', 'Chris Kyle', 'Clint Eastwood', 'bradley cooper',
        'bradley', 'cooper', 'chris kyle', 'chris', 'kyle', 'clint eastwood',
        'clint', 'eastwood'
    ]

    cleaned_df_tweet = pd.DataFrame(columns=['sentences', 'keywords'])
    for index, row in new_df_tweet.iterrows():
        if any(celeb in row['keywords'] for celeb in celeb_list):
            cleaned_df_tweet = pd.concat(
                [cleaned_df_tweet,
                 pd.DataFrame([{'sentences': row['text'],
                                'keywords': row['keywords']}])],
                ignore_index=True)

    cleaned_df_tweet.to_csv(phrase_filepath,
                            sep=',',
                            encoding='utf-8',
                            index=False)
    new_df_tweet.to_csv(all_phrasefile_path,
                        sep=',',
                        encoding='utf-8',
                        index=False)
    return new_df_tweet, cleaned_df_tweet
Example #21
from pytextrank import json_iter, parse_doc, pretty_print
import sys

## Stage 1:
##  * perform statistical parsing/tagging on a document in JSON format
##
## INPUTS: <stage0>
## OUTPUT: JSON format `ParsedGraf(id, sha1, graf)`

if __name__ == "__main__":
    path_stage0 = sys.argv[1]

    for graf in parse_doc(json_iter(path_stage0)):
        print(pretty_print(graf._asdict()))
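Invoked from the command line with the stage-0 file as its only argument, e.g. `python stage1.py o0.json > o1.json` (file names hypothetical).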
Example #22
def parseSentence():
    with open('temp2.json', 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter('temp1.json')):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
    def perform_statistical_parsing_tagging(self, text_file, paragraph_output):
        with open(paragraph_output, 'w') as f:
            for paragraph in pytextrank.parse_doc(
                    pytextrank.json_iter(text_file)):
                f.write("%s\n" % pytextrank.pretty_print(paragraph._asdict()))
Example #24
#!/usr/bin/env python
# encoding: utf-8

from pytextrank import json_iter, parse_doc, pretty_print
import sys

## Stage 1:
##  * perform statistical parsing/tagging on a document in JSON format
##
## INPUTS: <stage0>
## OUTPUT: JSON format `ParsedGraf(id, sha1, graf)`

if __name__ == "__main__":
  path_stage0 = sys.argv[1]

  for graf in parse_doc(json_iter(path_stage0), force_encode=False):
    print(pretty_print(graf._asdict()))
Example #25

if os.path.exists(OUTPUT_DIRECTORY):
    shutil.rmtree(OUTPUT_DIRECTORY)
os.makedirs(OUTPUT_DIRECTORY)


print('Saving tweets to json...')
with open(TWEETS_JSON, 'w', encoding='utf8') as outfile:
    json.dump({'id': '777', 'text': '. '.join(user_tweets)}, outfile, ensure_ascii=False)
print('Saving tweets to json - Done')


print('Performing statistical parsing/tagging on tweets...')
with open(STATISTICAL_PARSING_OUTPUT, 'w') as f:
    for graf in pytextrank.parse_doc(pytextrank.json_iter(TWEETS_JSON)):
        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
print('Performing statistical parsing/tagging on tweets... - Done')


print('Collecting and normalizing the key phrases from the parsed document...')
graph, ranks = pytextrank.text_rank(STATISTICAL_PARSING_OUTPUT)
pytextrank.render_ranks(graph, ranks)
with open(KEY_PHRASES_NORMALIZATION_OUTPUT, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(STATISTICAL_PARSING_OUTPUT, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
print('Collecting and normalizing the key phrases from the parsed document... - Done')


print("Summarizing tweets based on key phrases...")
phrases = ", ".join(set([p for p in pytextrank.limit_keyphrases(KEY_PHRASES_NORMALIZATION_OUTPUT, phrase_limit=MAX_SUBJECTS_TO_SHOW)]))
Example #26
def search_textrank(text):
    text_dict = [{'id': '0', 'text': text}]
    for graf in pytextrank.parse_doc(text_dict):
        print(graf._asdict())
Example #27
def stage1(path_stage0, path_stage1):
    # Stage 1: statistical parsing/tagging
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
Example #28
# Create dictionary to feed into json file

file_dic = {"id": 0, "text": sample_text}
file_dic = json.dumps(file_dic)
loaded_file_dic = json.loads(file_dic)

# Create test.json and feed file_dic into it.
with open('test.json', 'w') as outfile:
    json.dump(loaded_file_dic, outfile)

path_stage0 = "test.json"
path_stage1 = "o1.json"

# Extract keyword using pytextrank
with open(path_stage1, 'w') as f:
    for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
        #print(pytextrank.pretty_print(graf._asdict()))

path_stage1 = "o1.json"
path_stage2 = "o2.json"

graph, ranks = pytextrank.text_rank(path_stage1)
pytextrank.render_ranks(graph, ranks)

with open(path_stage2, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
        #print(pytextrank.pretty_print(rl))

path_stage1 = "o1.json"
Example #29
sometext = 'Compatibility of systems of linear constraints over the set of natural numbers. ' + \
           'Criteria of compatibility of a system of linear Diophantine equations, ' + \
           'strict inequations, and nonstrict inequations are considered. Upper bounds ' + \
           'for components of a minimal set of solutions and algorithms of construction of ' + \
           'minimal generating sets of solutions for all types of systems are given. ' + \
           'These criteria and the corresponding algorithms for constructing a minimal ' + \
           'supporting set of solutions can be used in solving all the considered types ' + \
           'systems and systems of mixed types.'
someothertext = 'Amazon.com, Inc. is located in Seattle, WA and was founded July 5th, 1994 by Jeff Bezos, ' + \
    'allowing customers to buy everything from books to blenders. Seattle is north of Portland and ' + \
    'south of Vancouver, BC. Other notable Seattle - based companies are Starbucks and Boeing.'

docs = [{'text': sometext, 'id': 777}]

grafs = [{'graf': graf.graf} for graf in pytextrank.parse_doc(docs)]
graph, ranks = pytextrank.text_rank(grafs)
rank_list = [
    rl._asdict()
    for rl in pytextrank.normalize_key_phrases(grafs, ranks, skip_ner=False)
]
kernel = pytextrank.rank_kernel(rank_list)
sents = [s._asdict() for s in pytextrank.top_sentences(kernel, grafs)]
phrases = [
    p[0] for p in pytextrank.limit_keyphrases(rank_list, phrase_limit=6)
]

sent_iter = sorted(pytextrank.limit_sentences(sents, word_limit=150),
                   key=lambda x: x[1])
sents = [pytextrank.make_sentence(sent_text) for sent_text, idx in sent_iter]
graf_text = ' '.join(sents)
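Printing the in-memory results (a hypothetical follow-up using the variables above):

print("keywords:", ", ".join(phrases))
print("excerpts:", graf_text)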
Example #30
        reviews_by_ASIN = defaultdict(list)
        filename = "{publisher}_{version}_UserItemReviews.duplicatesRemoved".format(
            version=version, publisher=publisher)
        final_filename = os.path.join(directory, filename)
        books = pickle.load(open(final_filename, 'rb'))
        fake_json_graph_dicts = []  #stage 1 output
        for book in books:
            texts_pooled = []
            titles_pooled = []
            fake_json = [{
                'id': user,
                'text': review_text
            } for user, asin, title, review_text, timestamp in books[book]]
            print(book)
            # pprint.pprint(books[book][0])
            for graf in pytextrank.parse_doc(fake_json):
                graph_dict = graf._asdict()
                fake_json_graph_dicts.append([graph_dict])
        stage_1_filename = "{publisher}_{version}_textRank.grafDict.".format(
            version=version, publisher=publisher)
        stage_1_out = os.path.join(directory, "TextRankStages",
                                   stage_1_filename)
        pickle.dump(fake_json_graph_dicts, open(stage_1_out, 'wb'))
        graph, ranks = pytextrank.text_rank(fake_json_graph_dicts)
        pytextrank.render_ranks(graph, ranks)
        rl_fake_json = []  #stage 2 output
        for rl in pytextrank.normalize_key_phrases(
                fake_json_graph_dicts, ranks, stopwords=RAKE.SmartStopList()):
            print(pytextrank.pretty_print(rl._asdict()))
            rl_fake_json.append([rl._asdict()])