示例#1
0
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE, sub_stories, save_path, order_params):
    """Summarize each (text, file_name) pair in sub_stories with the GA solver.

    Args:
        processID: worker-process identifier (unused here; kept for the caller's interface).
        POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE: GA hyper-parameters forwarded to Summerizer.
        sub_stories: iterable of (document_text, file_name) pairs.
        save_path: directory the summary files are written into.
        order_params: ordering parameters forwarded to Summerizer.
    """
    for example in sub_stories:
        file_name = os.path.join(save_path, example[1])
        start_time = time.time()
        # BUG FIX: the separator is the literal " . "; an unescaped "." in a regex
        # matches *any* character, so the old pattern also split on e.g. " a ".
        raw_sents = re.split(r" \. ", example[0])
        # Drop sentences rejected by clean_text (it signals rejection with the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) < 5:
            # Too little material to summarize.
            continue

        preprocessed_sentences = [preprocess_raw_sent(s) for s in raw_sentences]

        if len(preprocessed_sentences) < 10:
            # NOTE(review): this writes a fallback summary but still falls through to
            # the GA below — presumably a `continue` is missing here; confirm intent.
            solution_for_exception(raw_sentences, file_name)
        title = preprocessed_sentences[0]  # first sentence serves as the title

        # TF-IDF vectors of the individual sentences; row 0 is the title vector.
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        list_sentences_frequencies = vectors.todense().tolist()
        title_vector = list_sentences_frequencies[0]

        # TF-IDF of the whole document treated as one string.
        document = [" ".join(preprocessed_sentences)]
        vector_doc = vectorizer.fit_transform(document)
        document_vector = vector_doc.todense().tolist()[0]

        number_of_nouns = count_noun(raw_sentences, option=True)
        simWithTitle = sim_with_title(list_sentences_frequencies, title_vector)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = sim_with_doc(list_sentences_frequencies, document_vector)
        NUM_PICKED_SENTS = 4
        print("Done preprocessing!")
        print('time for processing', time.time() - start_time)

        Solver = Summerizer(title, preprocessed_sentences, raw_sentences, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE, NUM_PICKED_SENTS, simWithTitle, simWithDoc, sim2sents, number_of_nouns, order_params)
        best_individual = Solver.solve()

        print(file_name)
        if best_individual is None:
            # Fall back to a heuristic summary when the GA finds no solution.
            solution_for_exception(raw_sentences, file_name)
        else:
            print(best_individual)
            Solver.show(best_individual, file_name)
示例#2
0
def sim_with_title_of_paragraph(document):
    """Score every sentence's similarity to the first sentence of its paragraph.

    The document is split into paragraphs on '\n' and each paragraph into
    sentences on ' . '; the paragraph's first sentence acts as its "header".

    Returns:
        (raw_sents, preprocessed_sents, sim_header): flat lists over all
        paragraphs, where sim_header[i] is the TF-IDF similarity of sentence i
        to its paragraph header.
    """
    paragraphs = document.split('\n')
    sim_header = []
    raw_sents = []
    preprocessed_sents = []
    for para in paragraphs:
        raw = para.split(' . ')
        df = pd.DataFrame(raw, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        # .copy() prevents pandas' SettingWithCopyWarning on the assignment below.
        newdf = df.loc[(df['preprocess_raw'] != 'None')].copy()
        # BUG FIX: apply on the *filtered* frame, not the original df — the old
        # code also preprocessed rows that clean_text had already rejected.
        newdf['preprocessed_raw'] = newdf['preprocess_raw'].apply(
            lambda x: preprocess_raw_sent(x))

        raw_sentences = newdf['preprocess_raw'].values.tolist()
        preprocessed_sentences = newdf['preprocessed_raw'].values.tolist()
        if not preprocessed_sentences:
            # An empty paragraph would make TfidfVectorizer raise on an empty vocabulary.
            continue

        raw_sents.extend(raw_sentences)
        preprocessed_sents.extend(preprocessed_sentences)

        # Similarity of every sentence to the paragraph header (first sentence).
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(preprocessed_sentences)
        denselist = vectors.todense().tolist()
        simWithTitle = sim_with_title(denselist, denselist[0])
        sim_header.extend(simWithTitle)
        del df
        del newdf

    return raw_sents, preprocessed_sents, sim_header
示例#3
0
def main():
    """Summarize every document in the 'stories' directory, writing results to 'hyp'."""
    # GA hyper-parameters.
    POPU_SIZE = 30
    MAX_GEN = 20
    CROSS_RATE = 0.8
    MUTATE_RATE = 0.4
    NUM_PICKED_SENTS = 4

    directory = 'stories'
    save_path = 'hyp'

    print("Setting: ")
    print("POPULATION SIZE: {}".format(POPU_SIZE))
    print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))
    print("CROSSING RATE: {}".format(CROSS_RATE))
    print("MUTATION SIZE: {}".format(MUTATE_RATE))

    # list of (document_text, file_name) pairs
    stories = load_docs(directory)
    start_time = time.time()
    for example in stories:
        try:
            raw_sents = example[0].split(" . ")
            print("Preprocessing ", example[1])
            sentences = []
            sentences_for_NNP = []

            # Drop sentences rejected by clean_text (it signals rejection with the string 'None').
            df = pd.DataFrame(raw_sents, columns=['raw'])
            df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
            newdf = df.loc[(df['preprocess_raw'] != 'None')]
            raw_sentences = newdf['preprocess_raw'].values.tolist()

            for raw_sent in raw_sentences:
                sent = preprocess_raw_sent(raw_sent)
                sent_tmp = preprocess_numberOfNNP(raw_sent)
                sentences.append(sent)
                sentences_for_NNP.append(sent_tmp)

            title_raw = raw_sentences[0]
            title = preprocess_raw_sent(title_raw)
            number_of_nouns = count_noun(sentences_for_NNP)

            simWithTitle = sim_with_title(sentences, title)
            sim2sents = sim_2_sent(sentences)
            simWithDoc = []
            for sent in sentences:
                simWithDoc.append(sim_with_doc(sent, sentences))

            print("Done preprocessing!")
            # DONE!

            Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                                MAX_GEN, CROSS_RATE, MUTATE_RATE,
                                NUM_PICKED_SENTS, simWithTitle, simWithDoc,
                                sim2sents, number_of_nouns)
            best_individual = Solver.PSO()
            file_name = os.path.join(save_path, example[1])

            if best_individual is None:
                print('No solution.')
            else:
                print(file_name)
                print(best_individual)
                Solver.show(best_individual, file_name)
        except Exception as e:
            # Best-effort batch run: log the failing document and keep going.
            print(example[1])
            print("type error: " + str(e))

    # BUG FIX: guard against an empty corpus before dividing by len(stories).
    if stories:
        print("--- %s mins ---" % ((time.time() - start_time) /
                                   (60.0 * len(stories))))
示例#4
0
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params, scheme):
    """Summarize each (text, file_name) pair in sub_stories with the GA solver.

    Args:
        processID: worker-process identifier (unused here; kept for the caller's interface).
        POPU_SIZE, CROSS_RATE, MUTATE_RATE: GA hyper-parameters forwarded to Summerizer.
        MAX_GEN: caller-supplied cap — NOTE(review): overwritten below based on
            document length, so the passed value is effectively ignored; confirm intent.
        sub_stories: iterable of (document_text, file_name) pairs, one sentence per line.
        save_path: directory the summary files are written into.
        order_params, scheme: extra parameters forwarded to Summerizer.
    """
    for example in sub_stories:
        start_time = time.time()
        raw_sents = example[0].split("\n")

        # Drop sentences rejected by clean_text (it signals rejection with the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue

        title_raw = raw_sentences[0]

        # BUG FIX: the original removed items from raw_sentences while iterating
        # over it, which silently skips the element after each removal. Build
        # the filtered lists instead.
        kept_raw = []
        sentences = []
        sentences_for_NNP = []
        for raw_sent in raw_sentences:
            sent = preprocess_raw_sent(raw_sent)
            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                continue  # drop near-empty sentences
            kept_raw.append(raw_sent)
            sentences.append(sent)
            sentences_for_NNP.append(sent_tmp)
        raw_sentences = kept_raw
        if not sentences:
            # Every sentence was filtered out; nothing to summarize.
            continue
        title = preprocess_raw_sent(title_raw)
        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = []
        for i in range(len(sentences)):
            simWithDoc.append(
                sim_with_doc(list_sentences_frequencies, index_sentence=i))

        # Scale the generation budget with document length.
        if len(sentences) < 20:
            MAX_GEN = 20
        elif len(sentences) < 50:
            MAX_GEN = 50
        else:
            MAX_GEN = 80

        print("POPULATION SIZE: {}".format(POPU_SIZE))
        print("MAX NUMBER OF GENERATIONS: {}".format(MAX_GEN))

        print("Done preprocessing!")
        # DONE!
        print('time for processing', time.time() - start_time)
        # Summaries are 4 sentences, or everything if the document is shorter.
        if len(sentences) < 4:
            NUM_PICKED_SENTS = len(sentences)
        else:
            NUM_PICKED_SENTS = 4

        MinLT = 1
        MaxLT = 7

        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE, NUM_PICKED_SENTS,
                            simWithTitle, simWithDoc, sim2sents,
                            number_of_nouns, order_params, MinLT, MaxLT,
                            scheme)
        best_individual = Solver.solve()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)
示例#5
0
def start_run(processID, POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE,
              sub_stories, save_path, order_params):
    """Summarize each (text, file_name) pair in sub_stories with the PSO solver.

    Args:
        processID: worker-process identifier (unused here; kept for the caller's interface).
        POPU_SIZE, MAX_GEN, CROSS_RATE, MUTATE_RATE: hyper-parameters forwarded to Summerizer.
        sub_stories: iterable of (document_text, file_name) pairs, one sentence per line.
        save_path: directory the summary files are written into.
        order_params: ordering parameters forwarded to Summerizer.
    """
    for example in sub_stories:
        start_time = time.time()
        raw_sents = example[0].split("\n")
        # Drop sentences rejected by clean_text (it signals rejection with the string 'None').
        df = pd.DataFrame(raw_sents, columns=['raw'])
        df['preprocess_raw'] = df['raw'].apply(lambda x: clean_text(x))
        newdf = df.loc[(df['preprocess_raw'] != 'None')]
        raw_sentences = newdf['preprocess_raw'].values.tolist()
        if len(raw_sentences) == 0:
            continue

        title_raw = raw_sentences[0]

        # BUG FIX: the original removed items from raw_sentences while iterating
        # over it, which silently skips the element after each removal. Build
        # the filtered lists instead.
        kept_raw = []
        sentences = []
        sentences_for_NNP = []
        for raw_sent in raw_sentences:
            sent = preprocess_raw_sent(raw_sent)
            sent_tmp = preprocess_raw_sent(raw_sent, True)
            if len(sent.split(' ')) < 2:
                continue  # drop near-empty sentences
            kept_raw.append(raw_sent)
            sentences.append(sent)
            sentences_for_NNP.append(sent_tmp)
        raw_sentences = kept_raw
        if not sentences:
            # Every sentence was filtered out; nothing to summarize.
            continue
        title = preprocess_raw_sent(title_raw)
        list_sentences_frequencies = word_frequencies(sentences, title)
        number_of_nouns = count_noun(sentences_for_NNP)
        simWithTitle = sim_with_title(list_sentences_frequencies)
        sim2sents = sim_2_sent(list_sentences_frequencies)
        simWithDoc = []
        for i in range(len(sentences)):
            simWithDoc.append(
                sim_with_doc(list_sentences_frequencies, index_sentence=i))

        print("Done preprocessing!")
        # DONE!
        print('time for processing', time.time() - start_time)
        # Summaries are 4 sentences, or everything if the document is shorter.
        if len(sentences) < 4:
            NUM_PICKED_SENTS = len(sentences)
        else:
            NUM_PICKED_SENTS = 4

        Solver = Summerizer(title, sentences, raw_sentences, POPU_SIZE,
                            MAX_GEN, CROSS_RATE, MUTATE_RATE, NUM_PICKED_SENTS,
                            simWithTitle, simWithDoc, sim2sents,
                            number_of_nouns, order_params)
        best_individual = Solver.PSO()
        file_name = os.path.join(save_path, example[1])

        if best_individual is None:
            print('No solution.')
        else:
            print(file_name)
            print(best_individual)
            Solver.show(best_individual, file_name)