# Exemplo n.º 1
def combifyData(DF):
    """Build a spintax DataFrame for every 'Related Terms' group of DF.

    For each related term: joins the group's headings into one text blob,
    extracts the above-average combos via combify(), builds a topic frame via
    createTopic(), spins it via combifyTopic(), and finally writes all results
    to "Spintax.csv" (and prints them).

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain at least the columns 'Related Terms' and 'Heading'.

    Returns
    -------
    None — side effects are the CSV file and the print.
    """
    grouped_df = DF.groupby('Related Terms')
    frames = []  # collect per-term frames; concat ONCE at the end (avoids quadratic concat-in-loop)
    for relatedTerm, item in grouped_df:
        # All headings of this related term joined into one text blob.
        text = ' '.join(str(heading) for heading in item["Heading"])
        combos = combify(text, 1, stop_words=stopwords)
        # Keep only combos found strictly more than (mean + 1) times.
        threshold = combos["NUMBER_OF_TIMES_FOUND"].mean() + 1
        df = combos[combos["NUMBER_OF_TIMES_FOUND"] > threshold]
        df = df.sort_values(by="NUMBER_OF_TIMES_FOUND", ascending=False)  # most frequent first

        orderedCombos = tuple(df.index)
        # Function call for each (DF for that related term, related term, and its combos).
        dfObj = createTopic(grouped_df.get_group(relatedTerm), relatedTerm, orderedCombos)

        frames.append(combifyTopic(dfObj))

    if frames:
        dataframes = pd.concat(frames)
    else:
        # No groups at all — keep the original empty-output shape.
        dataframes = pd.DataFrame(columns=['Term', 'Heading', 'Spintext'])
    dataframes.to_csv("Spintax.csv", index=False, encoding='utf-8')
    print(dataframes)
# Exemplo n.º 2
def combifyTopic(DF):
    """Generate spintax rows for every 'Topic' group of a related-term frame.

    For each topic: rebuilds the topic text via groupFix(), finds the
    above-average word combos, tags every sentence containing a combo word,
    keeps sentences whose combo-word signature occurs at least twice, and
    feeds them to generateSpintax().

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain at least the columns 'Related Term' and 'Topic'.

    Returns
    -------
    pandas.DataFrame with columns ['Term', 'Heading', 'Spintext'] —
    one row per topic that produced spintax.
    """
    term = str(DF['Related Term'].iloc[0])
    grouped_topic = DF.groupby('Topic')

    # Hoisted out of the loop: loading the punkt model is expensive and loop-invariant.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    results = []  # row dicts for the output frame (DataFrame.append was removed in pandas 2.0)
    for key, item in grouped_topic:
        text, headingText = groupFix(grouped_topic.get_group(key), key)

        # Calculate combos for the paragraphs; keep only the ones found
        # strictly more than (mean + 1) times, most frequent first.
        combo_df = combify(text, 1, stop_words=stopwords)  # returns a DataFrame
        threshold = combo_df["NUMBER_OF_TIMES_FOUND"].mean() + 1
        combo_df = combo_df[combo_df["NUMBER_OF_TIMES_FOUND"] > threshold]
        combo_df = combo_df.sort_values(by="NUMBER_OF_TIMES_FOUND", ascending=False)
        combos = tuple(combo_df.index)

        # Split the topic text into sentences.
        sentences = sent_detector.tokenize(text.strip())

        # One row per (sentence, combo word) hit.
        rows = []
        for sent in sentences:
            for word in combos:
                if findWholeWord(word)(sent) is not None:
                    rows.append({'Sentence': sent, 'Word': word, 'Heading': headingText})
        dfObj = pd.DataFrame(rows, columns=['Sentence', 'Word', 'Heading'])

        if not dfObj.empty:
            # Collapse duplicate sentences, joining their combo words.
            dfObj = dfObj.groupby('Sentence').agg({'Word': ', '.join, 'Heading': 'first'}).reset_index()
            dfObj = dfObj[['Sentence', 'Word', 'Heading']]  # re-arrange columns
            dt = dfObj.sort_values(by="Word", ascending=False)
            dt['Count'] = dt.groupby('Word')['Word'].transform('count')
            dt['Len'] = dt['Sentence'].str.len()  # character count
            dt = dt.sort_values(by="Count", ascending=False)
            # Only keep combo-word signatures that occur at least twice.
            dt = dt[dt["Count"] >= 2]
            if not dt.empty:
                response = generateSpintax(dt, combos)
                headingTextforThis = '{' + str(dt['Heading'].iloc[0]) + '}'
                results.append({'Term': term, 'Heading': headingTextforThis, 'Spintext': response})

    # Returns one term's all-topics spintax.
    return pd.DataFrame(results, columns=['Term', 'Heading', 'Spintext'])
# Exemplo n.º 3
def _pop_first_match(word_list, lowered_text):
    """Return and remove the first word of word_list contained in lowered_text, else ''."""
    for word in word_list:
        if word in lowered_text:
            word_list.remove(word)
            return word
    return ""


def all_data(list_of_links):
    """Scrape each link, tag headers with topic words, score paragraphs and
    spin the best ones through the rewriter API.

    Side effects: writes several CSV files plus "shitballs.txt" (original
    article text) and "shittier balls.txt" (spun text). Returns None.

    Parameters
    ----------
    list_of_links : iterable of str
        URLs to feed to smash_and_grab(); failures are logged and skipped.
    """
    dataframes = []
    for link in list_of_links:
        try:
            dataframes.append(smash_and_grab(link))
        except Exception:  # was a bare except:; keep best-effort but let KeyboardInterrupt through
            print("failed to grab", link)
    df = pd.concat(dataframes, ignore_index=True)
    print("grabbed data")
    df = df[["url", "header", "first line"]]

    # O(n) join instead of quadratic `headings += head`.
    headings = "".join(str(head) for head in df["header"])
    most_words = combify(headings, 1, stopwords)
    most_words.sort_values(by="NUMBER_OF_TIMES_FOUND",
                           inplace=True,
                           ascending=False)

    # Extract up to three topic words (and a question word) per header.
    topics = []
    for i in df.index:
        head_to_check = df.at[i, "header"]
        lowered = head_to_check.lower()  # hoisted: was recomputed in every inner loop
        word_list = list(most_words.index)
        small_dict = {
            "header": head_to_check,
            # Defaults to "" when no match (original left "First topic word"
            # missing entirely, producing NaN rows).
            "First topic word": _pop_first_match(word_list, lowered),
            "Second topic word": _pop_first_match(word_list, lowered),
            "Third topic word": _pop_first_match(word_list, lowered),
        }
        # Last matching question word wins, as in the original iteration order.
        small_dict["Q word"] = ""
        for q in ("how", "what", "who", "where", "when"):
            if q in lowered:
                small_dict["Q word"] = q
        topics.append(small_dict)
    topics_info = pd.DataFrame(topics)
    topics_info = topics_info[[
        "header", "First topic word", "Second topic word", "Third topic word",
        "Q word"
    ]]
    print(topics_info.head(10))

    # topics_info rows were built in df.index order, so an index merge aligns
    # them 1:1. (Passing `on=` together with left_index/right_index, as the
    # original did, raises MergeError in modern pandas.)
    df = df.merge(topics_info.drop(columns="header"),
                  left_index=True,
                  right_index=True,
                  how="right")
    print(df.head())

    # O(n) join instead of quadratic `texd += words`.
    texd = "".join(str(words).strip() for words in df["first line"])
    combos = combify(texd, 1, stop_words=stopwords, limit=2)
    combos.to_csv("Combo output for thingy for MTD.csv")
    useful_combos = combos["NUMBER_OF_TIMES_FOUND"].to_dict()

    # Keep only rows where at least one topic word survives the binner.
    df["BIN"] = "Bin"
    for col in ("First topic word", "Second topic word", "Third topic word"):
        for i in df.index:
            if topic_binner(df.at[i, col]):
                df.at[i, "BIN"] = "keep"
    df = df[df["BIN"] == "keep"]
    df = df[[useful_para(useful_combos, x) for x in df["first line"]]]
    df["head topics"] = (df["First topic word"] + df["Second topic word"]
                         + df["Third topic word"])
    df.drop(columns="BIN", inplace=True)

    como_list = list(useful_combos.keys())
    df["score"] = [combo_counter(como_list, x) for x in df["first line"]]

    # Best-scoring row per head-topic combination.
    topic_scores = {}
    for i in df.index:
        head_t = df.at[i, "head topics"]
        score = df.at[i, "score"]
        entry = topic_scores.get(head_t)
        if entry is None or score > entry["score"]:
            topic_scores[head_t] = {
                "topic": head_t,
                "score": score,
                "heading": df.at[i, "header"],
                "paragraph": df.at[i, "first line"],
            }
    article = pd.DataFrame.from_dict(topic_scores, "index")
    # Raw strings: "\d" / "\£" in plain strings are invalid escapes (DeprecationWarning).
    article.replace(r"\d+", "", inplace=True, regex=True)
    article.replace(r"£", "", inplace=True, regex=True)
    article.sort_values(by="score", inplace=True, ascending=False)
    article = article.head(30)
    article = article.reset_index()
    article.to_csv("Output for thingy for MTD.csv", index=False)

    # Number each paragraph with a "||N|| " marker so the spun text can be
    # re-aligned afterwards; markers are protected via como_list.
    parts = []
    for count, each in enumerate(article["paragraph"], start=1):
        marker = "||" + str(count) + "|| "
        como_list.append(marker)
        parts.append(marker + each + "\n")
    text = "".join(parts)
    with open("shitballs.txt", "w+") as fh:  # context manager instead of open/close
        fh.write(text)

    response = rewriter.api._transform_plain_text('unique_variation', text,
                                                  como_list, 'high')
    post_text = response["response"]
    with open("shittier balls.txt", "w+") as fh:
        fh.write(post_text)

    # Re-align spun paragraphs with their original rows by marker number.
    stored_number = 0
    fin_article = {}
    for line in post_text.split("||"):
        if len(line.strip()) > 0:
            if number_check(line.strip()):
                stored_number = line
            else:
                fin_article[int(stored_number) - 1] = line
                print(stored_number, line)
    post_data = pd.DataFrame.from_dict(fin_article,
                                       orient='index',
                                       columns=["spun text"])
    output = article.merge(post_data, left_index=True, right_index=True)
    output.to_csv("FINISHeD THING.csv", index=False)