def shingles_file(f):
    """Build the word list and shingles for the file record *f*.

    Only records with ``f.processed == 1`` are handled: the text stored at
    ``f.path_txt`` is read into ``f.text``, every word of every phrase
    returned by ``generateCandidateKeywords`` is appended to ``f.words``,
    and ``f.shingles`` is produced by ``shingles.gen_shingles``. When no
    shingles come out, ``f.processed`` is set to ``-1`` and the function
    returns before the timing report.

    Args:
        f: Record exposing ``path_txt``, ``processed`` and ``words`` and
            accepting ``text`` / ``shingles`` assignments.

    Returns:
        None. All results are stored on *f*.
    """
    print("Started getting shingles for file \"{0}\"".format(f.path_txt))
    start_time = time.time()

    if f.processed == 1:
        # The context manager closes the handle even when read() raises.
        with open(f.path_txt, "r") as text_file:
            f.text = text_file.read()

        candidate_keywords = generateCandidateKeywords(
            f.text, stopwords, lemmatizer=lemmatizer.get()
        )
        # Flatten the phrases into the record's running word list.
        f.words.extend(word for phrase in candidate_keywords for word in phrase)

        f.shingles = shingles.gen_shingles(f.words)
        if len(f.shingles) == 0:
            print("No shingles were built in \"{0}\"".format(f.path_txt))
            f.processed = -1
            return

    # NOTE(review): the source was flattened onto one line; the timing report
    # is assumed to sit at function level (mirroring the start message) — confirm.
    end_time = time.time()
    print("Getting shingles for file \"{0}\"\ntook {1:.3f}".format(f.path_txt, end_time - start_time))
def run(self):
    """Download one candidate file and score it against the reference file.

    Stages, in order: download ``self._file.url``; compare the file hash
    with ``self._compare_file.hash``; parse the PDF into text; build
    shingles; compare them with ``self._compare_file.shingles``.

    The outcome is stored in ``self._return``: ``self._file`` on success
    (its ``similarity`` is either the string ``"HASH_COINSIDENCE"`` or the
    value returned by ``shingles.compare``), or ``None`` when a stage fails.
    Progress messages are printed while holding ``threadLock`` so that
    output of concurrent workers does not interleave inside one message.
    """

    def report(*lines):
        # Print one message group atomically, followed by a blank line.
        # NOTE(review): assumes threadLock is a threading.Lock/RLock (it is
        # only used via acquire/release in the original) — confirm.
        with threadLock:
            print("\n".join(lines), end="\n\n")

    def report_similarity(value_text):
        # Framed summary of the final similarity verdict for this file.
        frame = '////////////////////////////////////////////'
        report(
            frame,
            "Similarity with file",
            "\'{0}\'".format(candidate.url),
            "is",
            value_text,
            frame,
        )

    candidate = self._file

    # --- Downloading the file ---
    report("{0}) Downloading \"{1}\"".format(self._id, candidate.url))
    try:
        candidate.path = download.download(candidate.url, directory=DOWNLOAD_DIR, callback=None)
    except Exception:
        # Previously a bare `except:` followed by exit(0), which inside a
        # worker thread merely raised SystemExit; fail explicitly instead,
        # in the same way as the other failure paths of this method.
        report("Download \"{0}\" failed".format(candidate.url))
        self._return = None
        return

    if candidate.path == "":
        report(
            "Error, during downloading (server is not corresponding)",
            "\"{0}\"".format(candidate.url),
        )
        candidate.appropriate = False
        self._return = None
        return

    report(
        "File \"{0}\"".format(candidate.url),
        "Saved in \'{0}\'".format(candidate.path),
    )

    # --- Checking the hash sum for coincidence ---
    candidate.makehash()
    if candidate.hash == self._compare_file.hash:
        candidate.similarity = "HASH_COINSIDENCE"
        self._return = candidate
        report_similarity("HASH_COINSIDENCE")
        return

    # --- Parsing the PDF into TXT ---
    candidate.path_txt = candidate.path + ".txt"
    report("Started parsing \"{0}\"".format(candidate.path))
    start_time = time()
    parse_pdf(candidate)
    end_time = time()
    report("Parsing \"{0}\" took {1:.3f}".format(candidate.path, end_time - start_time))

    # NOTE(review): presumably parse_pdf clears `appropriate` on failure — confirm.
    if candidate.appropriate == False:  # noqa: E712 - keep the original equality semantics
        self._return = None
        return

    # --- Building shingles ---
    report("Started getting shingles for file \"{0}\"".format(candidate.path_txt))
    start_time = time()
    candidate.text = get_text(candidate)
    candidate.words = get_words(candidate, self._stopwords)
    candidate.shingles = shingles.gen_shingles(candidate.words)
    end_time = time()

    if len(candidate.shingles) == 0:
        report("No shingles were built in file \"{0}\"".format(candidate.path_txt))
        self._return = None
        return

    report(
        "Shingles were built successfully for an file \"{0}\"".format(candidate.path_txt),
        "Took about {0:.3f}s".format(end_time - start_time),
    )

    # --- Comparing the files using shingles ---
    report("Started getting similarity based on shingles in file \"{0}\"".format(candidate.path_txt))
    start_time = time()
    candidate.similarity = shingles.compare(self._compare_file.shingles, candidate.shingles)
    end_time = time()
    report(
        "Ended comparing shingles for file \"{0}\"".format(candidate.path_txt),
        "Took {0:.3f}s".format(end_time - start_time),
    )

    report_similarity("{0:.3f}".format(candidate.similarity))
    self._return = candidate
# Flattened excerpt of the main script (the original line breaks and
# indentation were lost). Reading left to right, it:
#   1. Iterates over `files`; for every entry with `processed == 1` it reads
#      `path_txt`, flattens the phrases returned by
#      `generateCandidateKeywords` into a word list, stores that list in
#      `f.words` and builds `f.shingles` from it. When no shingles result it
#      prints a warning and sets `f.processed = -1`.
#      NOTE(review): the nesting is inferred from statement order — confirm.
#   2. Announces work on the main file and creates `input_file` by calling
#      `file(path=...)` with a hard-coded PDF path.
#   3. Opens a `try:` that runs `./pdf_import.py` through `subprocess.call`
#      with `f.path_pdf` / `f.path_txt` and reports a non-zero return code.
#      NOTE(review): the matching `except`/`finally` is not part of this
#      excerpt, and `f` is whatever the loop above left bound — confirm intent.
# NOTE(review): `f_tmp` is closed by hand; `with open(...)` would also close
# the handle when `read()` raises.
''' Started getting words from txts ''' print("\nGetting shingles for txt files") for f in files: if (f.processed == 1): words = [] f_tmp = open(f.path_txt,"r") text = f_tmp.read() f_tmp.close() candidate_keywords = generateCandidateKeywords(text, stopwords, lemmatizer) for sublist in candidate_keywords: for word in sublist: words.append(word) f.words = words f.shingles = shingles.gen_shingles(words) if (len(f.shingles) == 0): print("\nNo shingles were built in file \"{0}\"\nPossible, a file is empty - PDF parse error\n".format(f.path_txt)) f.processed = -1 """ Work with input file """ print("Started work with main file") input_file_path = "./Task/Выделение набора ключевых слов/0470749822.pdf" input_file = file(path = input_file_path) try: retcode = subprocess.call(["python", "./pdf_import.py", f.path_pdf, f.path_txt]) if (retcode != 0): print("Error while parsing {0}".format(f.path_pdf))
# NOTE(review): subtracts the current timestamp from EXTRA_TIME; presumably
# balanced by an `EXTRA_TIME += time()` outside this excerpt — confirm.
EXTRA_TIME -= time()
print("Getting keyword phrases took {0:.3f}".format(end_time - start_time), "seconds", end="\n\n")

# Dump the first element of every keyword entry, one per line, when an
# output path is configured. The context manager closes the file even if a
# write fails.
if output_file.path_txt is not None:
    with open(output_file.path_txt, "w") as out:
        out.writelines("{0}\n".format(key[0]) for key in keywords)

# Build the shingles of the input file.
print("Started getting shingles for input file \"{0}\"".format(input_file.path_txt))
start_time = time()
input_file.words = file_process.get_words(input_file, stopwords)
input_file.shingles = shingles.gen_shingles(input_file.words)
end_time = time()

# Without shingles there is nothing to compare against, so stop here.
if len(input_file.shingles) == 0:
    print("No shingles were built in input file")
    exit(0)

print("Shingles were built successfully for an input file \"{0}\"".format(input_file.path_txt))
print("Took about {0:.3f}s".format(end_time - start_time), end="\n\n")

# Build the Google query: every keyword phrase wrapped in parentheses and
# joined with "+", e.g. "(first phrase)+(second phrase)". An empty keyword
# list yields an empty query, exactly as the previous slice-based code did.
query = "+".join("({0})".format(wordphrase[0]) for wordphrase in input_file.keywords)

print("Getting links from Google search")