예제 #1
0
def get_words(f, stopwords=None, lemmatize=None):
    '''
    Return the flat list of candidate-keyword words for a file f.

    Parameters:
        f         -- object with a ``text`` attribute holding the document text.
        stopwords -- optional set of words to exclude; defaults to a fresh
                     empty set (avoids the shared-mutable-default pitfall of
                     ``stopwords=set()``).
        lemmatize -- optional lemmatizer forwarded to
                     ``generateCandidateKeywords``.

    Returns:
        list of words, the candidate-keyword phrases flattened in order.
    '''
    # Build a fresh set per call instead of sharing one default instance.
    if stopwords is None:
        stopwords = set()
    candidate_keywords = generateCandidateKeywords(f.text, stopwords,
                                                   lemmatizer=lemmatize)
    # Flatten the list of phrases into a single word list.
    return [word for phrases in candidate_keywords for word in phrases]
예제 #2
0
def shingles_file(f):
    '''
    Build shingles for file f.

    If ``f.processed == 1``, reads the extracted text from ``f.path_txt``,
    flattens the candidate-keyword phrases into ``f.words`` and stores the
    result of ``shingles.gen_shingles`` in ``f.shingles``.  When no shingles
    can be built, sets ``f.processed = -1`` and returns early.

    Relies on module-level names: ``stopwords``, ``lemmatizer``,
    ``shingles``, ``generateCandidateKeywords`` and ``time``.
    '''
    print("Started getting shingles for file \"{0}\"".format(f.path_txt))
    start_time = time.time()
    if f.processed == 1:
        # Context manager guarantees the file is closed even if read() raises.
        with open(f.path_txt, "r") as f_tmp:
            f.text = f_tmp.read()
        candidate_keywords = generateCandidateKeywords(f.text, stopwords,
                                                       lemmatizer=lemmatizer.get())
        # Flatten candidate phrases into the file's word list.
        for phrases in candidate_keywords:
            f.words.extend(phrases)
        f.shingles = shingles.gen_shingles(f.words)
        if len(f.shingles) == 0:
            print("No shingles were built in \"{0}\"".format(f.path_txt))
            f.processed = -1
            return
    end_time = time.time()
    print("Getting shingles for file \"{0}\"\ntook {1:.3f}".format(f.path_txt, end_time - start_time))
예제 #3
0
# Dead code kept for reference: the old exact-hash plagiarism check.
# (Left as a string literal so it stays out of execution.)
'''
for f in files:
    if (input_file_hash == f.hash):
        print("\nPlagiarize: {0}\nin file: \"{1}\"\ndownloaded from: \"{2}\"".format(1.0, f.path_pdf, f.url))
'''

# NOTE: the original had a bare line of prose here and an unclosed
# triple-quote, which made the snippet unparseable; it is now a proper
# string-comment block.
'''
Started getting words from txts
'''
print("\nGetting shingles for txt files")
for f in files:
    if f.processed == 1:
        # Context manager closes the handle even if read() raises.
        with open(f.path_txt, "r") as f_tmp:
            text = f_tmp.read()
        candidate_keywords = generateCandidateKeywords(text, stopwords, lemmatizer)
        # Flatten the phrase lists into a single word list for shingling.
        f.words = [word for sublist in candidate_keywords for word in sublist]
        f.shingles = shingles.gen_shingles(f.words)
        if len(f.shingles) == 0:
            print("\nNo shingles were built in file \"{0}\"\nPossible, a file is empty - PDF parse error\n".format(f.path_txt))
            f.processed = -1


"""
Work with input file
"""
print("Started work with main file")
input_file_path = "./Task/Выделение набора ключевых слов/0470749822.pdf"
예제 #4
0
    #in1 = open(sys.argv[1])
    text1 = in1.read()
    in1.close()
    in2 = open("./TMP/file1.pdf.txt", "r")
    #in2 = open(sys.argv[2])
    text2 = in2.read()
    in2.close()

    # NOTE(review): function-local imports -- the enclosing definition starts
    # above this fragment, so the project helpers are pulled in here.
    from split import get_list
    from keywords import generateCandidateKeywords


    # Build the flattened word list for the first text, with no stopword
    # filtering and no lemmatization.
    words = []
    stopwords = set()
    lemmatizer = None
    candidate_keywords = generateCandidateKeywords(text1, stopwords, lemmatizer)
    for sublist in candidate_keywords:
        for word in sublist:
            words.append(word)
    shingles1 = gen_shingles(words)
    print(len(shingles1))
    # NOTE(review): looks like a debugging leftover -- this exit(0) makes
    # everything below unreachable; confirm whether the comparison of the
    # second text should actually run.
    exit(0)
    
    # NOTE(review): `words` is not reset before this second pass, so if this
    # branch ever executes, shingles2 would be built from the words of BOTH
    # texts -- verify intent.
    candidate_keywords = generateCandidateKeywords(text2, stopwords, lemmatizer)
    for sublist in candidate_keywords:
        for word in sublist:
            words.append(word)
    shingles2 = gen_shingles(words)
    print(len(shingles2))

    print(compare(shingles2, shingles1))
예제 #5
0
    # NOTE(review): subtracts the current timestamp from EXTRA_TIME;
    # presumably a matching `EXTRA_TIME += time()` exists elsewhere so the
    # pair accumulates elapsed time -- confirm against the full file.
    EXTRA_TIME -= time()
    print("Getting keyword phrases took {0:.3f}".format(end_time - start_time), "seconds", end = "\n\n")
    
    # Dump the extracted keywords (one per line) when an output path is set.
    if (output_file.path_txt != None):
        out = open(output_file.path_txt, "w")
        for key in keywords:
            out.write("{0}\n".format(key[0]))
        out.close()

    '''
    Started getting shingles for an input file
    '''
    print("Started getting shingles for input file")
    start_time = time()
    # Flatten the candidate keyword phrases into input_file.words, then
    # build the shingles used later for comparison.
    input_file.words = []
    candidate_keywords = generateCandidateKeywords(input_file.text, stopwords, lemmatizer = lemmatizer.get())
    for phrases in candidate_keywords:
        for word in phrases:
            input_file.words.append(word)
    input_file.shingles = shingles.gen_shingles(input_file.words)
    end_time = time()
    # No shingles means the input file could not be processed -- abort.
    if (len(input_file.shingles) == 0):
        input_file.processed = -1
        print("No shingles were built in input file")
        exit(0)
    print("Shingles were built successfully for an input file", end = "\n\n")


    '''
    Starting building query for Google, get urls for request
    '''