def get_words(f, stopwords=set(), lemmatize=None):
    ''' Collect the words of all candidate keyword phrases for a file f '''
    words = []
    candidate_keywords = generateCandidateKeywords(f.text, stopwords, lemmatizer=lemmatize)
    for phrases in candidate_keywords:
        for word in phrases:
            words.append(word)
    return words
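# generateCandidateKeywords is imported from the keywords module and used as a
# black box above. For orientation only, here is a minimal RAKE-style sketch of
# what such a generator might do (split text into phrases at stopwords and
# punctuation); the real implementation may differ, and the lemmatizer protocol
# (an NLTK-style .lemmatize()) is an assumption.
import re

def generate_candidate_keywords_sketch(text, stopwords, lemmatizer=None):
    words = re.findall(r"[a-zA-Z']+", text.lower())
    phrases, current = [], []
    for word in words:
        if lemmatizer is not None:
            word = lemmatizer.lemmatize(word)  # assumed NLTK-style lemmatizer
        if word in stopwords:
            # a stopword ends the current candidate phrase
            if current:
                phrases.append(current)
                current = []
        else:
            current.append(word)
    if current:
        phrases.append(current)
    return phrases  # list of phrases, each a list of words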
def shingles_file(f):
    ''' Build shingles for a file f (uses the module-level stopwords and lemmatizer) '''
    print("Started getting shingles for file \"{0}\"".format(f.path_txt))
    start_time = time.time()
    if f.processed == 1:
        with open(f.path_txt, "r") as f_tmp:
            f.text = f_tmp.read()
        candidate_keywords = generateCandidateKeywords(f.text, stopwords, lemmatizer=lemmatizer.get())
        for phrases in candidate_keywords:
            for word in phrases:
                f.words.append(word)
        f.shingles = shingles.gen_shingles(f.words)
        if len(f.shingles) == 0:
            print("No shingles were built in \"{0}\"".format(f.path_txt))
            f.processed = -1
            return
    end_time = time.time()
    print("Getting shingles for file \"{0}\"\ntook {1:.3f} seconds".format(f.path_txt, end_time - start_time))
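# shingles.gen_shingles is another external dependency. A common construction
# is word-level w-shingling: slide a fixed-size window over the word list and
# hash each window. This is only a sketch under that assumption (the window
# size and CRC32 hashing are choices made here, not taken from the real module).
import zlib

def gen_shingles_sketch(words, shingle_len=3):
    result = []
    for i in range(len(words) - shingle_len + 1):
        shingle = " ".join(words[i:i + shingle_len])
        result.append(zlib.crc32(shingle.encode("utf-8")))
    return result  # empty when there are fewer than shingle_len words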
'''
for f in files:
    if (input_file_hash == f.hash):
        print("\nPlagiarize: {0}\nin file: \"{1}\"\ndownloaded from: \"{2}\"".format(1.0, f.path_pdf, f.url))
'''

''' Started getting words from txts '''
print("\nGetting shingles for txt files")
for f in files:
    if f.processed == 1:
        words = []
        with open(f.path_txt, "r") as f_tmp:
            text = f_tmp.read()
        candidate_keywords = generateCandidateKeywords(text, stopwords, lemmatizer)
        for sublist in candidate_keywords:
            for word in sublist:
                words.append(word)
        f.words = words
        f.shingles = shingles.gen_shingles(words)
        if len(f.shingles) == 0:
            print("\nNo shingles were built in file \"{0}\"\nPossibly the file is empty - PDF parse error\n".format(f.path_txt))
            f.processed = -1

""" Work with the input file """
print("Started work with main file")
# The folder name is Russian for "Keyword set extraction"
input_file_path = "./Task/Выделение набора ключевых слов/0470749822.pdf"
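# The loops above read and write a fixed set of attributes on each file
# record (path_pdf, path_txt, url, hash, processed, text, words, shingles).
# The class below is a hypothetical sketch of that shape, inferred purely
# from how the records are used in this section; the project's real class
# may differ.
class FileRecord:
    def __init__(self, path_pdf, path_txt, url=""):
        self.path_pdf = path_pdf   # original PDF location
        self.path_txt = path_txt   # extracted plain-text location
        self.url = url             # where the PDF was downloaded from
        self.hash = None           # content hash for the exact-match check
        self.processed = 0         # 1 = text extracted OK, -1 = failed/empty
        self.text = ""             # raw extracted text
        self.words = []            # flattened candidate-keyword words
        self.shingles = []         # hashed shingles built from the words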
import sys

from split import get_list
from keywords import generateCandidateKeywords
# Assumption: gen_shingles and compare live in the shingles module used elsewhere.
from shingles import gen_shingles, compare

# The uncommented open() for in1 was lost; "./TMP/file0.pdf.txt" is an assumed
# path mirroring in2 below.
in1 = open("./TMP/file0.pdf.txt", "r")
#in1 = open(sys.argv[1])
text1 = in1.read()
in1.close()

in2 = open("./TMP/file1.pdf.txt", "r")
#in2 = open(sys.argv[2])
text2 = in2.read()
in2.close()

stopwords = set()
lemmatizer = None

words = []
candidate_keywords = generateCandidateKeywords(text1, stopwords, lemmatizer)
for sublist in candidate_keywords:
    for word in sublist:
        words.append(word)
shingles1 = gen_shingles(words)
print(len(shingles1))
#exit(0)  # debugging exit, commented out so the comparison below is reachable

words = []  # reset, otherwise shingles2 would mix words from both texts
candidate_keywords = generateCandidateKeywords(text2, stopwords, lemmatizer)
for sublist in candidate_keywords:
    for word in sublist:
        words.append(word)
shingles2 = gen_shingles(words)
print(len(shingles2))
print(compare(shingles2, shingles1))
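# compare() is external as well. A standard measure for shingle sets is
# Jaccard similarity: shared distinct shingles over the union. A minimal
# sketch under that assumption (the real compare() may normalize differently,
# e.g. by the size of one document's shingle set):
def compare_sketch(shingles_a, shingles_b):
    set_a, set_b = set(shingles_a), set(shingles_b)
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)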
EXTRA_TIME -= time()  # opens an excluded-time interval; a matching "EXTRA_TIME += time()" is expected later
print("Getting keyword phrases took {0:.3f}".format(end_time - start_time), "seconds", end="\n\n")

if output_file.path_txt is not None:
    with open(output_file.path_txt, "w") as out:
        for key in keywords:
            out.write("{0}\n".format(key[0]))

''' Started getting shingles for the input file '''
print("Started getting shingles for input file")
start_time = time()
input_file.words = []
candidate_keywords = generateCandidateKeywords(input_file.text, stopwords, lemmatizer=lemmatizer.get())
for phrases in candidate_keywords:
    for word in phrases:
        input_file.words.append(word)
input_file.shingles = shingles.gen_shingles(input_file.words)
end_time = time()
if len(input_file.shingles) == 0:
    input_file.processed = -1
    print("No shingles were built in input file")
    exit(0)
print("Shingles were built successfully for the input file", end="\n\n")

''' Start building the Google query and collect URLs for the request '''
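# The query-building step begins right after this point. Assuming keywords is
# a score-sorted list of (phrase, score) pairs, as the output loop above
# suggests, a query could be formed from the top-scoring phrases like this.
# This is a sketch, not the project's actual query builder; build_google_query
# and its parameters are hypothetical.
import urllib.parse

def build_google_query(keywords, top_n=5):
    phrases = [key[0] for key in keywords[:top_n]]  # assumes sorted by score
    query = " ".join(phrases)
    return "https://www.google.com/search?q=" + urllib.parse.quote_plus(query)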