def getresult(inputdata):
    model = joblib.load('contextIdentifierBangla.pkl')
    df = pd.read_csv("word_weights.csv")
    distinctwords = df['word']

    rwdata = inputdata
    rwdata = rwdata.splitlines()
    inputdata = tanvir_stemming.prepare_input(inputdata)
    #print(inputdata)
    testdata = []
    #construct a feature vector for each input paragraph
    for i in inputdata:
        pg = [0] * 4  #one weighted score per category: politics, religious, sports, entertainment
        k = 0
        #feature vector for a paragraph
        for j in distinctwords:
            if j in i:
                cnt = i.count(j)
                pg[0] = pg[0] + cnt * df.at[k, 'politics']
                pg[1] = pg[1] + cnt * df.at[k, 'religious']
                pg[2] = pg[2] + cnt * df.at[k, 'sports']
                pg[3] = pg[3] + cnt * df.at[k, 'entertainment']
            k = k + 1
        testdata.append(pg)
    #predict a category for each paragraph and build the result list
    res = model.predict(testdata)
    indxrw = 0
    results = []  #one result dict per paragraph
    for i in res:
        result = {}
        if i == 100:
            result["paragraph"] = rwdata[indxrw]
            result["category"] = "politics"
            result["summary"] = summarizer.summary('politics', rwdata[indxrw])
        elif i == 200:
            result["paragraph"] = rwdata[indxrw]
            result["category"] = "religious"
            result["summary"] = summarizer.summary('religious', rwdata[indxrw])
        elif i == 300:
            result["paragraph"] = rwdata[indxrw]
            result["category"] = "sports"
            result["summary"] = summarizer.summary('sports', rwdata[indxrw])
        else:
            result["paragraph"] = rwdata[indxrw]
            result["category"] = "entertainment"
            result["summary"] = summarizer.summary('entertainment', rwdata[indxrw])
        results.append(result)
        #print(result)
        indxrw = indxrw + 1
    #print(results)
    return {"results": results}
Example #2
def get_summary():
    global result

    fetched_content = text_entry.get('1.0', tk.END)
    fetched_content = fetched_content.strip()
    if len(fetched_content) == 0:
        messagebox.showinfo("ERROR", "ENTER TEXT FOR SUMMARIZATION")
    else:
        result = summarizer.summary(fetched_content)
        if len(text_entry1.get('1.0', END)) != 0:
            text_entry1.delete('1.0', END)
        #print(result)
        text_entry1.insert(INSERT, result)  # summary insertion
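
# A minimal sketch of the Tkinter wiring the callback above assumes: the
# widget names text_entry / text_entry1 and the summarizer module come from
# the snippet itself, while the window layout below is illustrative only.
import tkinter as tk
from tkinter import END, INSERT, messagebox

import summarizer

root = tk.Tk()
text_entry = tk.Text(root, height=15)   # article to summarize
text_entry1 = tk.Text(root, height=10)  # summary output
text_entry.pack()
text_entry1.pack()
tk.Button(root, text="Summarize", command=get_summary).pack()
root.mainloop()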
Example #3
def conclusion():
    global result
    if len(result) == 0:
        result = ""

    full_content = text_entry3.get('1.0', END)
    full_content = full_content.strip()
    if len(full_content) == 0:
        messagebox.showinfo("Warning", "FILE IS EMPTY")
    else:
        result = summarizer.summary(full_content)  #generating the summary
        if len(result_text.get('1.0', END)) != 0:
            result_text.delete('1.0', END)
        result_text.insert(INSERT, result)  #displaying the summary
Example #4
def get_text():
    global result
    result = ""
    full_content = ""
    url_text = url.get()  # get() retrieves the URL text from the url entry widget
    if len(url_text) == 0:
        messagebox.showinfo("Warning", "URL NOT ENTERED")

    else:
        html_doc = request.urlopen(url_text)
        info = bs(html_doc, 'html.parser')  # parse the fetched HTML
        contents = info.findAll('p')  # findAll retrieves every paragraph (<p>) tag
        for content in contents:  # identifying only the text
            full_content += content.text

        result = summarizer.summary(full_content)
        if len(text_entry2.get('1.0', END)) != 0:
            text_entry2.delete('1.0', END)
        text_entry2.insert(INSERT, result)
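
# The fetch-and-summarize flow above can also be exercised without the GUI.
# A minimal standalone sketch: the helper name summarize_url is illustrative,
# the summarizer module is the one used by the snippet, and the request / bs
# imports are inferred from how the snippet calls them.
from urllib import request

from bs4 import BeautifulSoup as bs

import summarizer


def summarize_url(url_text):
    html_doc = request.urlopen(url_text)  # fetch the page
    info = bs(html_doc, 'html.parser')  # parse the HTML
    full_content = ''.join(p.text for p in info.findAll('p'))  # join all <p> text
    return summarizer.summary(full_content)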
Example #5
import codecs
from SemanticRoleLabelling import semanticRoleLabel
from summarizer import summary

if __name__ == "__main__":

    ## Read the news article (the file handle is closed automatically)
    with codecs.open("testFile.txt", "r", "utf-8") as testFile:
        testText = testFile.read()

    ## Create a semanticRoleLabel object for the article text
    semantic_role_obj = semanticRoleLabel(inputText=testText)

    ## Extract the {Subject, Action, Object} semantic roles as a list of events
    list_of_events = semantic_role_obj.get_semantic_roles()

    ## Create summary object
    summary_obj = summary()

    ## Generate the summary from the extracted events
    event_data = summary_obj.calculate_summary(list_of_events)
Example #6
def sumdocs(docs, tokenized_sents, offset, line_cnt, doc_id):
    #docs, tokenized_sents, offset, line_cnt, doc_id = args[0], args[1], args[2], args[3], args[4]
    global config, train2emb, test2emb, top_label_assignment, topics, document_phrase_cnt, inverted_index, OUT, comparative_dict, graph_builder

    start_time = time.time()
    if config['summ_method'] == 'sumdocs':
        # default is KNN search for CATE embedding
        #KNN comparative search, route 0 and route 1
        if config['comparative_opt'] == 'knn':
            count = defaultdict(int)
            for doc_agg_emb in test2emb[offset:offset + line_cnt]:
                sim_max = -1
                category = None
                for label in label2emb:
                    sim = 1 - spatial.distance.cosine(doc_agg_emb,
                                                      label2emb[label])
                    if sim > sim_max:
                        sim_max = sim
                        category = label
                count[category] += 1
            #print(count)

            category = max(count.items(), key=operator.itemgetter(1))[0]
            comp_pool = list(
                map(lambda x: x[0], top_label_assignment[category]))
            all_siblings = topics

            twin_docs = list(
                map(lambda x: x[0],
                    top_label_assignment[category][:config['num_siblings']]))
            siblings_docs = [
                list(
                    map(lambda x: x[0],
                        top_label_assignment[l][:config['num_siblings']]))
                for l in all_siblings if l != category
            ]

            comparative_docs = summarizer.compare(config,
                                                  None,
                                                  None,
                                                  None,
                                                  test2emb[offset:offset +
                                                           line_cnt],
                                                  train2emb,
                                                  skip_doc=None,
                                                  contain_doc=comp_pool)
            phrase_scores = summarizer.summary(config,
                                               docs,
                                               siblings_docs,
                                               twin_docs,
                                               comparative_docs,
                                               document_phrase_cnt,
                                               inverted_index,
                                               graph_builder=graph_builder)

    elif config['summ_method'] == 'sumdocs_wo_twins':
        count = defaultdict(int)
        for doc_agg_emb in test2emb[offset:offset + line_cnt]:
            sim_max = -1
            category = None
            for label in label2emb:
                sim = 1 - spatial.distance.cosine(doc_agg_emb,
                                                  label2emb[label])
                if sim > sim_max:
                    sim_max = sim
                    category = label
            count[category] += 1

        category = max(count.items(), key=operator.itemgetter(1))[0]
        comp_pool = list(map(lambda x: x[0], top_label_assignment[category]))
        all_siblings = topics

        # changes: 1010
        twin_docs = list(
            map(lambda x: x[0],
                top_label_assignment[category][:config['num_siblings']]))
        siblings_docs = [
            list(
                map(lambda x: x[0],
                    top_label_assignment[l][:config['num_siblings']]))
            for l in all_siblings if l != category
        ]
        category, comparative_docs = '', []
        phrase_scores = summarizer.summary(config,
                                           docs,
                                           siblings_docs,
                                           twin_docs,
                                           None,
                                           document_phrase_cnt,
                                           inverted_index,
                                           graph_builder=graph_builder)
    elif config['summ_method'] == 'sumdocs_textrank':
        category, comparative_docs = '', []
        phrase_scores = summarizer.summary(config,
                                           docs,
                                           None,
                                           None,
                                           None,
                                           document_phrase_cnt,
                                           inverted_index,
                                           graph_builder=graph_builder)
    elif config['summ_method'] == 'graph_degen':
        category, comparative_docs = '', []
        phrase_scores = graphdegen(docs)
    else:
        assert False, 'unknown summ_method: ' + config['summ_method']
    #
    #print(time.time() - start_time)
    #start_time = time.time()

    mmr_selector(tokenized_sents,
                 phrase_scores,
                 doc_id,
                 OUT,
                 limits=config['word_limits'])
    return category, comparative_docs
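
# For reference, a sketch of the keys the global `config` must provide for
# sumdocs() above (the keys are taken from the snippet; the example values
# are illustrative, not the original defaults):
example_config = {
    'summ_method': 'sumdocs',  # or 'sumdocs_wo_twins', 'sumdocs_textrank', 'graph_degen'
    'comparative_opt': 'knn',  # comparative-document search strategy used on route 0/1
    'num_siblings': 10,        # how many top documents per sibling topic
    'word_limits': 100,        # word budget handed to mmr_selector
}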
Example #7
def getTimeline(screen_name):
    today = datetime.datetime.now()
    DD = datetime.timedelta(days=7)
    earlier = today - DD
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('stopwords')
    auth = authenticate()
    api = tweepy.API(auth, wait_on_rate_limit=True)
    all_tweets = []
    count = 0  # number of tweets processed
    truth_value_sum = 0
    try:
        new_tweets = api.user_timeline(screen_name=screen_name,
                                       count=1000,
                                       result_type='recent',
                                       tweet_mode='extended',
                                       since=earlier.strftime("%Y-%m-%d"))

        for tweet in new_tweets:
            created_at = str(tweet.created_at).split(' ')
            date = created_at[0].split('-')
            time = created_at[1].split(':')
            text = re.sub(r'http\S+', '', tweet.full_text)
            # print(date[0] , date[1] , date[2])
            # print(time[0],time[1],time[2])
            #print(re.sub(r'http\S+', '', tweet.full_text),tweet.created_at,tweet.user.screen_name,)
            create_at = {}
            create_at['yyyy'] = int(date[0])
            create_at['mm'] = int(date[1])
            create_at['dd'] = int(date[2])
            create_at['hh'] = int(time[0])
            create_at['min'] = int(time[1])
            create_at['ss'] = int(time[2])
            t_value = truthfullness.get_value(text)
            if (t_value == -99):
                ret_tweet = {}
                ret_tweet['error'] = 'Model file not found'
                all_tweets.append(ret_tweet)
                json_data = json.dumps(ret_tweet, ensure_ascii=False)
                print('Error:Model file not found')
                return json_data

            truth_value_sum += t_value
            count += 1
            ret_tweet = {}

            about = summarizer.summary(text)
            if (len(about) <= 0):
                about.append('Nothing')

            ret_tweet['text'] = re.sub('\n', ' ', tweet.full_text)
            ret_tweet['user_name'] = tweet.user.screen_name
            ret_tweet['truth_score'] = t_value
            ret_tweet['about'] = about
            ret_tweet['create_at'] = create_at
            ret_tweet['tweet_url'] = 'https://twitter.com/statuses/' + tweet.id_str
            all_tweets.append(ret_tweet)
    except Exception:
        ret_tweet = {}
        ret_tweet['error'] = 'Error, please enter a valid screen name'
        all_tweets.append(ret_tweet)

    #print(len(all_tweets))
    ret_tweets = {}
    ret_tweets['avg_t_value'] = float(truth_value_sum) / count if count else 0.0
    ret_tweets['tweet_list'] = all_tweets

    json_data = json.dumps(ret_tweets, ensure_ascii=False)
    #print(json_data)
    return json_data
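
# A minimal usage sketch for getTimeline() (the screen name is a placeholder;
# authenticate(), truthfullness and summarizer are project modules assumed by
# the snippet above):
import json

timeline = json.loads(getTimeline('some_screen_name'))
print('average truth score:', timeline.get('avg_t_value'))
for tw in timeline.get('tweet_list', []):
    print(tw.get('truth_score'), tw.get('about'))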