def computeScores(inputDir, outCSV, acceptTypes):
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            try:
                row_cosine_distance = [file1, file2]

                file1_parsedData = parser.from_file(file1)
                file2_parsedData = parser.from_file(file2)

                v1 = Vector(file1, ast.literal_eval(file1_parsedData["content"]))
                v2 = Vector(file2, ast.literal_eval(file2_parsedData["content"]))

                row_cosine_distance.append(v1.cosTheta(v2))
                a.writerow(row_cosine_distance)
            except ConnectionError:
                sleep(1)
            except KeyError:
                continue
            except Exception, e:
                pass
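# Usage sketch (assumption): the directory, output path and accepted types below are
# hypothetical, and the csv/itertools/ast imports plus the Vector and filterFiles helpers
# are expected to come from the same module as computeScores above.
computeScores('/data/documents', 'cosine_scores.csv', ['pdf', 'html'])
# -> cosine_scores.csv gets one row per file pair: file1, file2, cosine similarity of their vectors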
def run_exit_tool_on_known_type(dir_list):
    file_list = get_file_list(dir_list)
    for entry in file_list:
        parser.from_file(entry)
    return
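# For reference, a minimal sketch of what tika-python's parser.from_file typically returns:
# a dict with 'metadata', 'content' and 'status' keys. 'example.pdf' is a placeholder path.
from tika import parser
parsed = parser.from_file('example.pdf')
print(parsed['status'])                          # HTTP status from the Tika server, e.g. 200
print(parsed['metadata'].get('Content-Type'))    # parsed metadata dict
print((parsed['content'] or '')[:200])           # extracted text; may be None for empty documents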
def computeScores(inputDir, outCSV, acceptTypes, allKeys):
    na_metadata = ["resourceName"]
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        filename_list = []
        for root, dirnames, files in os.walk(inputDir):
            dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for filename in files:
                if not filename.startswith('.'):
                    filename_list.append(os.path.join(root, filename))

        filename_list = [filename for filename in filename_list if parser.from_file(filename)]
        if acceptTypes:
            filename_list = [filename for filename in filename_list
                             if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1]
                             in acceptTypes]
        else:
            print "Accepting all MIME Types....."

        files_tuple = itertools.combinations(filename_list, 2)
        for file1, file2 in files_tuple:
            row_edit_distance = [file1, file2]

            file1_parsedData = parser.from_file(file1)
            file2_parsedData = parser.from_file(file2)

            intersect_features = set(file1_parsedData["metadata"].keys()) & set(file2_parsedData["metadata"].keys())
            intersect_features = [feature for feature in intersect_features if feature not in na_metadata]

            file_edit_distance = 0.0
            for feature in intersect_features:
                file1_feature_value = stringify(file1_parsedData["metadata"][feature])
                file2_feature_value = stringify(file2_parsedData["metadata"][feature])

                feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value)) / \
                    (len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value)
                     else len(file2_feature_value))
                file_edit_distance += feature_distance

            if allKeys:
                file1_only_features = set(file1_parsedData["metadata"].keys()) - set(intersect_features)
                file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                file2_only_features = set(file2_parsedData["metadata"].keys()) - set(intersect_features)
                file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                file_edit_distance += len(file1_only_features) + len(file2_only_features)
                file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))
            else:
                file_edit_distance /= float(len(intersect_features))    # average edit distance

            row_edit_distance.append(1 - file_edit_distance)
            a.writerow(row_edit_distance)
def command(in_dir, out_dir, tika_server):
    create_dirs(out_dir)

    in_files = get_files(in_dir)
    for fi in in_files:
        if tika_server:
            parsed = parser.from_file(fi, tika_server)
        else:
            parsed = parser.from_file(fi)

        out_file = out_file_name(out_dir, fi, 'txt')
        with codecs.open(out_file, 'wb', encoding='utf-8') as f:
            f.write(parsed['content'])
def intersect(json_filename, output_name, index_file, start_index=0, end_index=yaoner.MAX_INT_VALUE):
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    if index_file is None:
        index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'

    with open(json_filename) as json_file:
        json_data = json.load(json_file)

    concept_dictionary = dict()
    for key in json_data.keys():
        concept_dictionary[key.lower()] = {}

    file_list = yaoner.read_index_file(index_file, base_directory, start_index, end_index)
    for idx, val in enumerate(file_list):
        print(start_index + idx)
        parsed = parser.from_file(''.join([base_directory, val]))
        if 'content' in parsed and parsed['content'] is not None:
            content = parsed['content']
            words = content.split()
            for word in words:
                lowercased = word.lower()
                if lowercased in concept_dictionary:
                    last_part = os.path.basename(val)
                    concept_dictionary[lowercased][last_part] = 1

    dump(concept_dictionary, output_name + 'from' + str(start_index) + 'to' + str(end_index) + '.json')
    return
def extract(path):
    parsed = parser.from_file(path)
    content = parsed["content"]

    ners = StanfordExtractor(content).extract()
    entities = CustomEntityExtractor(content).extract()
    quantities = QuantityExtractor(content).getQuantities()

    if len(ners['LOCATION']) > 0:
        l = GeoTopic(map(lambda l: l['name'], ners['LOCATION']))
        geo = l.getInfo()
        locations = l.getLocations()
    else:
        geo = []
        locations = []

    return {
        'geo': geo,
        'locations': locations,
        'entities': entities['entities'],
        'places': ners['LOCATION'],
        'dates': ners['DATE'],
        'quantities': quantities,
        'metadata': parsed['metadata'],
        'mime-type': parsed['metadata']['Content-Type'],
        'id': idf.set(path)
    }
def run_ner(start_index=0, end_index=MAX_INT_VALUE):
    index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    file_list = read_index_file(index_file, base_directory, start_index, end_index)

    measurement_list = []
    index = 0 + start_index
    for entry in file_list:
        print(index)
        parsed = parser.from_file(''.join([base_directory, entry]))

        if 'metadata' in parsed:
            if 'X-TIKA:EXCEPTION:embedded_exception' in parsed['metadata']:
                index += 1
                continue

        if 'content' in parsed:
            if parsed['content'] is not None:
                # print(json.dumps(parsed['metadata'], indent=4))
                # print(parsed['content'])
                # print('content size ', len(parsed['content']))
                if len(parsed['content']) > 1 * 1024 * 1024:
                    index += 1
                    continue
                measurements = extract_measurement(parsed['content'])
                if measurements is not None and len(measurements) > 0:
                    measurement_list.append({entry.split('/')[-1]: measurements})
        index += 1

    dump_to_json(measurement_list,
                 '/Users/Frank/working-directory/ner-measurement-mentions/',
                 'from' + str(start_index) + 'to' + str(end_index))
    return
def filterFiles(inputDir, acceptTypes):
    filename_list = []

    for root, dirnames, files in os.walk(inputDir):
        dirnames[:] = [d for d in dirnames if not d.startswith('.')]
        for filename in files:
            if not filename.startswith('.'):
                filename_list.append(os.path.join(root, filename))

    filename_list = [filename for filename in filename_list if parser.from_file(filename)]
    if acceptTypes:
        filename_list = [filename for filename in filename_list
                         if str(parser.from_file(filename)['metadata']['Content-Type'].encode('utf-8')).split('/')[-1]
                         in acceptTypes]
    else:
        print "Accepting all MIME Types....."

    return filename_list
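# Usage sketch (assumption): acceptTypes is matched against only the MIME subtype, so 'pdf'
# selects files whose Content-Type is 'application/pdf'; the directory below is hypothetical.
pdf_files = filterFiles('/data/crawl', ['pdf'])
all_files = filterFiles('/data/crawl', None)   # prints "Accepting all MIME Types....."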
def compareValueSimilarity(fileDir, encoding='utf-8'):
    union_feature_names = set()
    file_parsed_data = {}
    resemblance_scores = {}
    file_metadata = {}

    for filename in fileDir:
        file_parsed = []
        parsedData = parser.from_file(filename)
        file_metadata[filename] = parsedData["metadata"]

        for key in parsedData["metadata"].keys():
            value = parsedData["metadata"].get(key)[0]
            if isinstance(value, list):
                value = ""
                for meta_value in parsedData["metadata"].get(key)[0]:
                    value += meta_value
            file_parsed.append(str(key.strip(' ').encode(encoding) + ": " + value.strip(' ').encode(encoding)))

        file_parsed_data[filename] = set(file_parsed)
        union_feature_names = union_feature_names | set(file_parsed_data[filename])

    total_num_features = len(union_feature_names)

    for filename in file_parsed_data.keys():
        overlap = file_parsed_data[filename] & set(union_feature_names)
        resemblance_scores[filename] = float(len(overlap)) / total_num_features

    sorted_resemblance_scores = sorted(resemblance_scores.items(), key=operator.itemgetter(1), reverse=True)

    return sorted_resemblance_scores, file_metadata
def load_topics(filename):
    languages.append(language.from_file(filename))

    parser_obj = parser.from_file(filename)
    if 'content' in parser_obj and parser_obj['content']:
        words.extend(get_nouns(parser_obj['content']))

    if 'metadata' in parser_obj:
        metadata_dict = parser_obj['metadata']

        if 'Author' in metadata_dict:
            if isinstance(metadata_dict['Author'], list):
                metadata.append(metadata_dict['Author'][0])
            else:
                metadata.append(metadata_dict['Author'])

        if 'xmp:CreatorTool' in metadata_dict:
            if isinstance(metadata_dict['xmp:CreatorTool'], list):
                metadata.extend(metadata_dict['xmp:CreatorTool'])
            else:
                metadata.append(metadata_dict['xmp:CreatorTool'])

        if 'Content-Type' in metadata_dict:
            if isinstance(metadata_dict['Content-Type'], list):
                metadata.append(metadata_dict['Content-Type'][0])
            else:
                metadata.append(metadata_dict['Content-Type'])

        if 'Company' in metadata_dict:
            if isinstance(metadata_dict['Company'], list):
                metadata.append(metadata_dict['Company'][0])
            else:
                metadata.append(metadata_dict['Company'])
def __init__(self, fileName):
    parsed = parser.from_file(fileName)
    metadata = parsed["metadata"]

    # Return re.sub('[\s+]', '', content)  # TODO: Delete... Very Redundant..
    content = parsed["content"]
    content = content.replace('\n', '')
    content = content.replace('\t', '')
    content = content.replace('\'', '')
    content = content.replace('\"', '')
    rx = re.compile('\W+')
    content = rx.sub(' ', content).strip()
    self.content = content

    # Title...
    try:
        title = metadata['title']
    except:
        title = 'Untitled'
    title = title.replace('\n', '')
    title = title.replace('\t', '')
    title = title.replace('\'', '')
    title = title.replace('\"', '')
    title = rx.sub(' ', title).strip()
    self.title = title

    # self.type = self.metadata['Content-Type-Hint']
    # self.name = self.metadata['resourceName']
    # lanFix = re.sub('[\s+]', '', content)
    self.lang = language.from_file(fileName)
def getKeywords(pdfFile, Occur):
    tikaurl = tika_obo.getTikaAddress()
    parsed = parser.from_file(pdfFile, tikaurl)
    metadata = parsed["metadata"]
    doccontent = parsed["content"]

    fullwordlist = obo.stripNonAlphaNum(doccontent)
    wordlist = obo.removeStopwords(fullwordlist, obo.stopwords)
    dictionary = obo.wordListToFreqDict(wordlist)
    sorteddict = obo.sortFreqDict(dictionary)

    count = 0
    keywords = []
    shortkey = []
    maxoccur = Occur
    for s in sorteddict:
        numocc = int(s[0])
        word = s[1].encode('utf-8')
        if numocc > maxoccur:
            keyword = {word: str(numocc)}
            keywords.append(keyword)
            if len(word) > 6:
                shortkey.append(word.lower())
        count = count + 1

    if Occur > 0:
        return shortkey
    return keywords
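# Usage sketch (assumption): with a positive occurrence threshold getKeywords returns the
# lowercased long words (shortkey); with Occur=0 it returns the {word: count} keyword dicts.
# 'report.pdf' is a placeholder file name.
keyword_counts = getKeywords('report.pdf', 0)
frequent_long_words = getKeywords('report.pdf', 5)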
def main(file_name):
    fi = open("sentences.txt", "w+")
    fi_summary = open("summary.txt", "w+")
    fi_cool = open("wtv.txt", "w+")

    score_sentences = SentenceScores()
    parsed = parser.from_file(file_name)
    print parsed["metadata"]
    content = parsed["content"]
    content = content.strip()
    fi_cool.write(content.encode("utf-8"))

    sentences = content.split(". ")
    sentences = map(clean_sentence, sentences)
    lines = score_sentences.get_summary_lines(sentences)
    max_len = len(lines) / 3
    needed_lines = lines[0:max_len]
    sorted_lines = sorted(needed_lines, key=lambda x: x[0])

    for line_num, score in sorted_lines:
        fi_summary.write((str(line_num + 1) + ", " + sentences[line_num]).encode("utf-8"))

    for sentence in sentences:
        fi.write(sentence.encode("utf-8"))

    fi.close()
    fi_summary.close()
def getTikaTags(filename):
    import tika
    from tika import parser
    import obo
    import tika_obo
    import gethavens

    tikaUrl = getTikaAddress()
    parsed = parser.from_file(filename, tikaUrl)
    metadata = parsed["metadata"]
    content = parsed["content"]

    jsonprops = {'cm:title': str(metadata['resourceName'])}
    for key in metadata:
        newkey = str(key)
        value = str(metadata[key])
        jsonprops[newkey] = value

    title = jsonprops['resourceName']
    namebreak = title.split('.')
    havenrecord = gethavens.getPropertiesHaven(str(jsonprops['resourceName']))

    jsonprops['Description'] = 'Ranked:' + str(havenrecord['rank']) \
        + ' most secretive Tax Haven\nhttps://www.google.co.uk/maps/place/' \
        + havenrecord['country']
    jsonprops['Name'] = havenrecord['country']
    jsonprops['cmis:title'] = str(title)
    jsonprops['cmis:author'] = 'admin'
    return jsonprops
def _request_pdf_data(self, url):
    parsed = parser.from_file(url)
    return {
        'url': url,
        'title': self._parse_pdf_title(parsed),
        'body': self._parse_pdf_body(parsed)
    }
def search_content(file_path, expressions):
    """Open a file and search its contents against a set of RegEx."""
    matches = []
    count = 0
    data = parser.from_file(file_path)

    if not data:
        # There is no content that could be extracted
        return matches

    # Read into an I/O buffer for better readline support
    content = io.StringIO(data['content'])

    # TODO: this may create a very large buffer for larger files;
    # we may need to convert this to a while readline() loop
    for line in content.readlines():
        count += 1  # count the number of lines
        if line:
            for rex in expressions:
                # Check the line against each expression
                res = rex.regex.search(line)
                if res:
                    # If there's a match append to the list
                    matches.append(cazobjects.CazRegMatch(res, file_path, count, rex.name))
    return matches
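# Minimal sketch (assumption): search_content only needs each expression object to expose a
# compiled .regex and a .name, so a namedtuple stands in here for the real cazobjects type;
# 'document.pdf' and the email pattern are hypothetical.
import collections
import re

Expression = collections.namedtuple('Expression', ['name', 'regex'])
rules = [Expression(name='email', regex=re.compile(r'[\w.+-]+@[\w-]+\.[\w.]+'))]
hits = search_content('document.pdf', rules)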
def file_parser(fname, pages=None):
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            text_array = []
            i = 0
            d = pdf.Document(fname)
            for i, p in enumerate(d, start=1):
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(l.text.encode('UTF-8'))
                if pages and i >= pages:
                    # break after x pages
                    break
            log.debug("Processed %i pages (%i max)", i, pages)
            return '\n'.join(text_array)
        except:
            # reraise everything
            raise
    else:
        try:
            content = parser.from_file(fname)['content']
            return (content or '').encode('UTF-8')
        except:
            # reraise everything
            raise
def makeSearchable(self, src, subdir):
    rootDir = subdir + "/examplePDFs"
    pdfPath = rootDir + "/" + "rawPdfs"
    finishedTextPath = rootDir + "/" + "finishedText"
    removed_text_path = rootDir + "/" + "removedText"
    gsPath = rootDir + "/" + "gsPdfs"
    imagesProcessedPath = rootDir + "/" + "imagesProcessed"
    imageText = rootDir + "/" + "imageText"

    if not os.path.exists(pdfPath):
        os.makedirs(pdfPath)
    if not os.path.exists(finishedTextPath):
        os.makedirs(finishedTextPath)
    if not os.path.exists(removed_text_path):
        os.makedirs(removed_text_path)
    if not os.path.exists(gsPath):
        os.makedirs(gsPath)
    if not os.path.exists(imagesProcessedPath):
        os.makedirs(imagesProcessedPath)
    if not os.path.exists(imageText):
        os.makedirs(imageText)

    filename, fileType = src.rsplit(".", 1)

    print("\n**********************")
    print("Processing file: " + filename)
    print("**********************\n")

    # Extract easy text
    print("Getting text that can be easily extracted...")
    rawText = parser.from_file(pdfPath + "/" + src)
    if rawText["content"] is None:
        print("Found no text to extract, continuing process")
    else:
        fileOutput = open(finishedTextPath + "/" + filename + ".txt", 'w')
        fileOutput.write(rawText["content"].encode("utf-8"))
        fileOutput.close()

    # Remove text from pdf
    print("Removing text from pdf")
    process1 = subprocess.Popen(['java', '-jar', 'PdfTextDeleter.jar', src,
                                 os.path.join(removed_text_path, src)])
    process1.wait()

    # Apply ghostscript to removed text pdfs
    if not os.path.exists(gsPath + "/" + filename + "-imgs"):
        os.makedirs(gsPath + "/" + filename + "-imgs")
    if not os.path.exists(rootDir + "/imagesProcessed/" + filename + "-imgs"):
        os.makedirs(rootDir + "/imagesProcessed/" + filename + "-imgs")
    if not os.path.exists(rootDir + "/imageText/" + filename + "-imgs"):
        os.makedirs(rootDir + "/imageText/" + filename + "-imgs")

    print("Converting left over pdf to images")
    process2 = subprocess.Popen(["gs", "-dNOPAUSE",
                                 "-sFONTPATH=/opt/local/share/ghostscript/9.16/Resource/Font/",
                                 "-sDEVICE=pngalpha", "-r300", "-dBATCH",
                                 "-sOutputFile=" + gsPath + "/" + filename + "-imgs" + "/" + filename + "-%03d.png",
                                 removed_text_path + "/" + src],
                                env={'PATH': '/opt/local/bin/'})
    process2.wait()

    self.preprocessImages(rootDir, subdir, src)
    self.applyOCRToImages(rootDir, subdir, src)
    self.mergeTextFiles(rootDir, subdir, src)
def parse_file(self, path):
    """
    Parses a file at given path

    :param path: path to file
    :return: parsed content
    """
    parsed = tkparser.from_file(path)
    parsed['file'] = os.path.abspath(path)
    return parsed
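# Usage sketch (assumption): parse_file is a method on some parser-wrapper class (called
# ParserWrapper here purely for illustration), and tkparser is tika.parser imported under
# an alias; the path is a placeholder.
result = ParserWrapper().parse_file('docs/sample.docx')
print(result['file'])                          # absolute path added by parse_file
print(result['metadata'].get('Content-Type'))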
def computeScores(inputDir, outCSV, acceptTypes):
    with open(outCSV, "wb") as outF:
        a = csv.writer(outF, delimiter=',')
        a.writerow(["x-coordinate", "y-coordinate", "Similarity_score"])

        files_tuple = itertools.combinations(filterFiles(inputDir, acceptTypes), 2)
        for file1, file2 in files_tuple:
            row_cosine_distance = [file1, file2]

            file1_parsedData = parser.from_file(file1)
            file2_parsedData = parser.from_file(file2)

            v1 = Vector(file1_parsedData["metadata"])
            v2 = Vector(file2_parsedData["metadata"])

            row_cosine_distance.append(v1.cosTheta(v2))
            a.writerow(row_cosine_distance)
def test_tika_solr():
    s = create_connection(u"Test")
    file_path = u"testdata/example.pdf"
    parsed = parser.from_file(file_path)
    log_parsed(parsed)

    s.add([parsed], commit=True)
    return 1, 0
def get_measurements(filename):
    parser_obj = parser.from_file(filename)
    if 'content' in parser_obj and parser_obj['content']:
        return [x for x in regextagger.tag(tokenizer.tokenize(parser_obj['content']))
                if x[1] != 'OTHER']


f_read = open(sys.argv[1], 'r')
given_text = f_read.read()
segmented_lines = nltk.sent_tokenize(given_text)
for text in segmented_lines:
    words = word_tokenize(text)
    sent = t_gram_tag.tag(words)
    print(sent)
def convert(filepath, output):
    parsed = parser.from_file(filepath)

    if output:
        basename, ext_pdf = os.path.splitext(os.path.basename(filepath))
        output_path = os.path.join(output, basename + '.json')
    else:
        extensionless_filepath, ext_pdf = os.path.splitext(filepath)
        output_path = extensionless_filepath + '.json'

    with open(output_path, 'wt') as textfile:
        json.dump(parsed, textfile, ensure_ascii=True)
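# Usage sketch: converting one PDF into a JSON dump of Tika's output; paths are hypothetical.
convert('papers/example.pdf', 'parsed_json/')   # -> parsed_json/example.json
convert('papers/example.pdf', None)             # -> papers/example.json next to the source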
def extractText(self, params):
    '''
    Using Tika to extract text from given file
    and return the text content.
    '''
    file_name = params['file_name']
    parsed = parser.from_file(file_name)
    status = IndexUploadedFilesText(file_name, parsed["content"])
    if status[0]:
        return {'job': 'text_extraction', 'status': 'successful',
                'comment': 'Text extracted and indexed to Solr.'}
    else:
        return {'job': 'text_extraction', 'status': 'unsuccessful', 'comment': status[1]}
def main():
    # read the folder name from argument
    arg_parser = argparse.ArgumentParser(description='Detecting near duplicates using SimHashes')
    arg_parser.add_argument('-f', '--folder', help='Folder with all the images', required=True)
    arg_parser.add_argument('-t', '--tika', help='Path to a running tika server', required=True)
    arg_parser.add_argument('-cb', '--contentbased',
                            help='true/false. Use content in deduplication calculation. Default = false '
                                 '(Must have Tika OCR enabled/ Have Tesseract Installed)',
                            required=False)
    args = arg_parser.parse_args()

    SIM_HASH = defaultdict()

    # read all files
    for root, dirs, files in os.walk(args.folder):
        # grab metadata from each file and write it to an output file
        files = sorted(files)
        for f in files:
            path = os.path.join(root, f)
            parsed_data = parser.from_file(path, args.tika)
            if args.contentbased and args.contentbased.lower() == 'true':
                SIM_HASH[f] = get_simhash(parsed_data, True)
            else:
                SIM_HASH[f] = get_simhash(parsed_data, False)

    # make clusters
    SORTED_HASH = sorted(SIM_HASH.items(), key=operator.itemgetter(1))

    DISTANCES = OrderedDict()
    DISTANCES[SORTED_HASH[0][0]] = None
    for i in range(1, len(SORTED_HASH)):
        DISTANCES[SORTED_HASH[i][0]] = simhash.get_hamming_distance(SORTED_HASH[i - 1][1], SORTED_HASH[i][1])

    # cluster images together
    cluster_number = 0
    CLUSTERS = defaultdict(list)
    for key, value in DISTANCES.iteritems():
        print key + ": " + str(value)
        if value is None:
            CLUSTERS[cluster_number].append(key)
        else:
            if value <= THRESHOLD:
                CLUSTERS[cluster_number].append(key)
            else:
                cluster_number += 1
                CLUSTERS[cluster_number].append(key)

    print '*' * 10 + 'CLUSTERS' + '*' * 10
    for key, value in CLUSTERS.iteritems():
        print 'CLUSTER ' + str(key) + ':'
        for x in value:
            print '\t' + x
    return
def parse_files(file_name):
    print("parsing file: %s\n" % file_name)
    parsed = parser.from_file(file_name)

    print("meta-data:\n")
    print(parsed["metadata"])

    print("content:\n")
    content = parsed["content"]
    c2 = content.encode('utf-8').strip()
    print(c2)
    print("\n\n")
def extractParsingInfo(self):
    FileSizeList = []

    # Getting the files whose size would be computed
    response = MIME_Core().facetQuery('metadata')
    mimeTypeResponse = response.result.dict['facet_counts']['facet_fields']['metadata']

    mimeList = []
    for mime_type, count in mimeTypeResponse.iteritems():
        if mime_type == 'application/java-archive':
            continue
        mimeList.append(mime_type)

    mime_size_diversity = {}
    for mime in mimeList:
        metadata_list = {}
        print mime[mime.index('/') + 1:]
        query = 'metadata:%s' % (mime)
        response = MIME_Core().queryAll(query=query, rows=100)
        files = response.result.dict['response']['docs']

        for file in files:
            parsed = parser.from_file(file['file'][0])
            if 'metadata' in parsed:
                metadata = parsed['metadata']
                for key, value in metadata.iteritems():
                    if key in mime_size_diversity:
                        mime_size_diversity[key] += 1
                    else:
                        mime_size_diversity[key] = 1

        print 'done with ' + mime

    top_metadata = sorted(mime_size_diversity.items(), key=operator.itemgetter(1), reverse=True)

    metadata = []
    for item in top_metadata[:20]:
        metadata.append(item[0])
        metadata.append(item[1])

    out_file = open('data/word_cloud/word_cloud.json', "w")
    json.dump(metadata, out_file, indent=4)
def extract_text(request, file_name):
    '''
    Using Tika to extract text from given file
    and return the text content.
    '''
    if "none" in IndexStatus("text", file_name):
        parsed = parser.from_file("{0}/{1}/{2}".format(APP_NAME, UPLOADED_FILES_PATH, file_name))
        status = IndexUploadedFilesText(file_name, parsed["content"])
        if status[0]:
            return HttpResponse(status=200, content="Text extracted.")
        else:
            return HttpResponse(status=400, content="Cannot extract text.")
    else:
        return HttpResponse(status=200, content="Loading...")
def scan(filelist, conf=DEFAULTCONF):
    results = []

    for f in filelist:
        metadata = parser.from_file(f).get('metadata', {})
        for field in conf['remove-entry']:
            if field in metadata:
                del metadata[field]
        results.append((f, metadata))

    metadata = {}
    metadata["Name"] = NAME
    metadata["Type"] = TYPE
    return results, metadata
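# Usage sketch (assumption): DEFAULTCONF is expected to carry a 'remove-entry' list of metadata
# keys to drop before reporting; the file list and keys below are hypothetical.
results, module_metadata = scan(['samples/a.pdf', 'samples/b.docx'],
                                conf={'remove-entry': ['X-Parsed-By', 'resourceName']})
for path, meta in results:
    print(path, meta.get('Content-Type'))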
def spell_checker(self, page, stopwordsList):
    driver = self.driver
    driver.implicitly_wait(2)
    driver.get(page)
    self.f.write('--- checking for spelling %s\n' % page)

    allTextOnPage = parser.from_file(page)['content'].encode('utf-8')
    allTextOnPage = re.findall('[a-z]+', allTextOnPage.lower())

    stopwordsList.extend(stopwords.words('english'))
    allTextOnPage = [w for w in allTextOnPage if w not in stopwordsList]

    for word in allTextOnPage:
        if not wordnet.synsets(word):
            print 'Is this correct? ', word
            self.f.write('Is this word correct? %s\n' % word)