Example #1
File: views.py Project: vickimo/shakev3
def upload(request):
	# user uploads a document -> convert into a dict of the terms found
	if request.FILES:
		if 'file' in request.FILES:
			result = ''
			f = request.FILES['file']
			fp = 'shake_v3/static/data/' + str(f)
			fp2 = fp[:len(fp)-3] + 'txt'
			if fp[len(fp)-3:len(fp)] == 'pdf':
				with open(fp, 'wb+') as pdff:
					for chunk in f.chunks():
						pdff.write(chunk)
				result = pdf_to_txt(fp)
				with open(fp2, 'wb+') as txtf:
					txtf.write(result)			
			elif fp[len(fp)-3:len(fp)] == 'rtf':
				with open(fp, 'wb+') as rtff:
					for line in f:
						rtff.write(line)
				doc = Rtf15Reader.read(open(fp, 'rb'))
				doctxt = PlaintextWriter.write(doc).getvalue()
				with open(fp2, 'wb+') as txtf:
					for line in doctxt:
						txtf.write(line)
				f = str(f)[:-4] + ".txt"
				result = doctxt
			else:
				with open(fp2, 'wb+') as txtf:
					for line in f:
						txtf.write(line)
				result = open(fp2, 'r').read()
		response_dict = generate_term_dict(result)
		response_dict['fp'] = 'static/data/' + str(f)
		return HttpResponse(simplejson.dumps(response_dict), mimetype='application/javascript')
	# user indicates terms -> give a grade
	elif request.POST:
		#TO DO: implement saving the data
		rating = ""
		score = custom_POST_to_score(request)
		if score > 4.5:
			rating = 'A+'
		elif score > 4:
			rating = 'A'
		elif score > 3.5:
			rating = 'B+'
		elif score > 3:
			rating = 'B'
		elif score > 2.5:
			rating = 'C+'
		elif score > 2:
			rating = 'C'
		elif score > 1:
			rating = 'D'
		else:
			rating = 'F'
		return HttpResponse(rating)
	# display the upload part 1
	else:
		score = 0
		return render_to_response('upload.html', {'score': score}, context_instance = RequestContext(request))
Example #2
def convert_to_txt(file_path):
    logger.debug("convert_to_txt: %s" % file_path)
    words = None
    if not os.path.exists(file_path):
        logger.error("missing file %s", file_path)
    file_size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s",file_size, file_path)
    ext = _get_extension(file_path)
    if ext == '.txt':
        logger.debug("loading txt file")
        worked = False
        try:
            encoding, file_handle, words = open_with_correct_encoding(file_path)
        except Exception as e:
            logger.error("Wasn't able to read the words from the file %s" % file_path)
            words = ""
    elif ext == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif ext == '.rtf':
        logger.debug("loading rtf file")
        doc = Rtf15Reader.read(open(file_path))
        words = PlaintextWriter.write(doc).getvalue()
    else:
        logging.warning("Couldn't find an extension on the file, so assuming text")
        with codecs.open(file_path, 'r', ENCODING_UTF_8) as myfile:
            words = myfile.read()
    logger.debug("loaded %d chars" % len(words))
    return words
Example #3
def get_one_month_from_rtf(url):
    rtf_file = urllib2.urlopen(url)
    rtf_file = StringIO(rtf_file.read())
    doc = Rtf15Reader.read(rtf_file)

    final_data = []

    header = False
    for c in doc.content:
        full_p = c.content.__repr__().lower()
        if "capacity" in full_p and "use cna" in full_p:
            header = True
            continue

        if header:
            row = re.split(r"\t", c.content[0].content[0])
            if len(row) == 7:
                final_data.append(row)

    df = pd.DataFrame(final_data, columns=["prison_name", "baseline_cna", "in_use_cna", "operational_capacity", "population", "perc_pop_to_used_cna", "perc_acc_available"])

    df.iloc[:, 1:] = df.iloc[:, 1:].replace("%", "", regex=True).replace(",", "", regex=True)

    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='ignore')

    cols = [c for c in df.columns if "perc" in c]
    df.loc[:, cols] = df.loc[:, cols] / 100
    return df
Example #4
def rtf_to_text(value):
    if len(value) == 0:
        return value
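    # encoding with latin-1 round-trips byte values 0-255 unchanged, handing Rtf15Reader the raw RTF bytes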
    rtf_doc = Rtf15Reader.read(BytesIO(value.encode("latin_1")))
    txt_doc = BytesIO()
    PlaintextWriter.write(rtf_doc, txt_doc, encoding="latin_1")
    return txt_doc.getvalue().decode("latin_1")
Example #5
def rtf(f):
    doc = Rtf15Reader.read(open(f, "rb"))
    result = []
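    # walk each paragraph in the parsed document and concatenate its text runs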
    for element in doc.content:
        for text in element.content:
            result.append("".join(text.content))
    return "".join(result)
Example #6
def convertRtfToText(path):
    from pyth.plugins.rtf15.reader import Rtf15Reader
    from pyth.plugins.plaintext.writer import PlaintextWriter

    doc = Rtf15Reader.read(open(path))

    print(PlaintextWriter.write(doc).getvalue())
Example #7
def GetExternal(version, odl_data, source, class_id):
    external = ""

    for item in version[2]:
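        # the "_Art1_RTF" attribute carries the RTF payload, either as a file reference or inline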
        if item[0] == "Attribute" \
                and item[1] == "_Art1_RTF":

            if len(item[2]) == 2:
                if isinstance(source, ZipFile):
                    data = source.open(item[2][0]).read()
                else:
                    file_name = join(source, item[2][0])
                    f = open(file_name, 'rb')
                    data = f.read()
                    f.close()
                data = data.replace("\x0c", "")
            elif len(item[2]) == 1:
                data = item[2][0]

            if data == "":
                return ""

            f = StringIO()
            f.write(data)
            doc = Rtf15Reader.read(f, clean_paragraphs = False)
            external = PlaintextWriter.write(doc).getvalue()
            external = external.replace("\n\n", "\n")

    return ReplaceTextNames(external, version, odl_data, class_id)
Example #8
 def read_recommendations(self, file_name):
     """
     Function reads the targeted values from the file "WHO Daily Recommended Values.rtf"
     It process the entries and creates a dictionary with
     Nutrient name as Key and Nutrient Value as value
     :param file_name:
     :return:
     """
     target = dict()
     filtered_col = list()
     doc = Rtf15Reader.read(open(file_name))
     entities = PlaintextWriter.write(doc).getvalue().split('\n\n')
     for item in entities:
         splited = item.split(',')
         name = splited[0].split('(')[0]
         value = splited[1]
         try:
             unit = splited[0].split('(')[1].split(')')[0]
         except:
             unit = ''
         # target.append({'nutrient': name,
         # 'unit': unit,
         # 'value': value})
         target.update({name: value})
         filtered_col.append(name)
     self.target_values = target
     return target, filtered_col
Example #9
def main():
    '''
    Purpose::

    Input::
    
    Output::
    
    Assumptions::
    '''
    # Get arguments
    args = parse_arguments()
    if args.url:
        url = args.url

    # Get file and read it into structure
    try:
        with open(url, 'rb') as rtffile:
            judges = extract_terms(Rtf15Reader.read(rtffile))
            #print PlaintextWriter.write(doc).getvalue()
                
    except IOError as e:
        print 'An error occurred fetching %s \n %s' % (url, e.reason)
        return 1

    f = open('US_legal_lexicon.txt', 'w')
    # Print data
    #f.write("\n".join(str(i).encode('utf8') for i in judges))
    for i in judges:
        f.write((i).encode('utf8') +'\n')

    f.close()
Example #10
    def compute(self):
        """ compute() -> None
        Dispatch the HTML contents to the spreadsheet
        """
        filename = self.get_input("File").name

        text_format = self.get_input("Format")
        with open(filename, 'rb') as fp:
            if text_format == 'html':
                html = fp.read()  # reads bytes
            elif text_format == 'rtf':
                try:
                    py_import('pyth', {'pip': 'pyth'})
                except ImportError:
                    raise ModuleError(
                        self, "'rtf' format requires the pyth "
                        "Python library")
                else:
                    from pyth.plugins.rtf15.reader import Rtf15Reader
                    from pyth.plugins.xhtml.writer import XHTMLWriter
                    doc = Rtf15Reader.read(fp)
                    html = XHTMLWriter.write(doc).read()  # gets bytes
            else:
                raise ModuleError(self, "'%s' format is unknown" % text_format)

        self.displayAndWait(RichTextCellWidget, (html, ))
Example #11
    def compute(self):
        """ compute() -> None
        Dispatch the HTML contents to the spreadsheet
        """
        filename = self.get_input("File").name

        text_format = self.get_input("Format")
        with open(filename, 'rb') as fp:
            if text_format == 'html':
                html = fp.read() # reads bytes
            elif text_format == 'rtf':
                try:
                    py_import('pyth', {'pip': 'pyth'})
                except ImportError:
                    raise ModuleError(self, "'rtf' format requires the pyth "
                                      "Python library")
                else:
                    from pyth.plugins.rtf15.reader import Rtf15Reader
                    from pyth.plugins.xhtml.writer import XHTMLWriter
                    doc = Rtf15Reader.read(fp)
                    html = XHTMLWriter.write(doc).read() # gets bytes
            else:
                raise ModuleError(self, "'%s' format is unknown" % text_format)

        self.displayAndWait(RichTextCellWidget, (html,))
Example #12
def main():
    '''
    Purpose::

    Input::
    
    Output::
    
    Assumptions::
    '''
    # Get arguments
    args = parse_arguments()
    if args.url:
        url = args.url

    # Get file and read it into structure
    try:
        with open(url, 'rb') as rtffile:
            judges = extract_terms(Rtf15Reader.read(rtffile))
            #print PlaintextWriter.write(doc).getvalue()

    except IOError as e:
        print 'An error occurred fetching %s \n %s' % (url, e.reason)
        return 1

    f = open('US_legal_lexicon.txt', 'w')
    # Print data
    #f.write("\n".join(str(i).encode('utf8') for i in judges))
    for i in judges:
        f.write((i).encode('utf8') + '\n')

    f.close()
Example #13
def decode_cell(cell):
    '''The cell matched so lets handle it'''
    
    # variable that will hold the converted text
    temp_cell = []
    
    # pyth checks for the rtf syntax before processing, so 'unicode_escape' escapes the '\' so pyth doesn't complain
    cell_encode = re.sub(r'\\u|\\\\u|\\N|\\\\N', ' ', cell)
    cell_encode = cell_encode.decode('unicode_escape')
    cell_encode = filter(lambda x: x in string.printable, cell_encode)
    cell_rtf = Rtf15Reader.read(StringIO(cell_encode))

    # turn the pyth object into readable text
    cell_txt = [x.content for x in cell_rtf.content]
    
    # iterate and extract the pyth object text into temp_cell
    for line in cell_txt:
        for l in line:
            temp_cell.append(l.content)
                
    
    # combine and join the extracted text into one string (for one cell)
    combined = [i for sub in temp_cell for i in sub]
    new_cell =  ' '.join(combined)
    
    # the non-ascii characters in your file were followed by _ so i removed them for cleanliness
    # uncomment to keep the _
    new_cell = re.sub('_', '', new_cell)
    
    # remove extra whitespace and return the converted cell
    # remove L at end of string
    return ' '.join(new_cell[:-1].split())
Example #14
def convert_to_txt(file_path):
    logger.debug("convert_to_txt: %s" % file_path)
    if not os.path.exists(file_path):
        logger.error("missing file %s", file_path)
    file_size = os.stat(file_path).st_size
    logger.debug("convert_to_txt: %d bytes at %s", file_size, file_path)
    ext = _get_extension(file_path)
    if ext == '.txt':
        logger.debug("loading txt file")
        try:
            encoding, file_handle, words = open_with_correct_encoding(
                file_path)
        except Exception:
            logger.error("Wasn't able to read the words from the file %s" %
                         file_path)
            words = ""
    elif ext == '.docx':
        logger.debug("loading docx file")
        words = _docx_to_txt(file_path)
    elif ext == '.rtf':
        logger.debug("loading rtf file")
        doc = Rtf15Reader.read(open(file_path))
        words = PlaintextWriter.write(doc).getvalue()
    else:
        logging.warning(
            "Couldn't find an extension on the file, so assuming text")
        with open(file_path, 'r') as myfile:
            words = myfile.read()
    logger.debug("loaded %d chars" % len(words))
    return words
Example #15
def analyze(committeeFile):
    
    try:
        doc = Rtf15Reader.read(open(committeeFile, "rb"))
    except:
        print "%s - skipped..." % committeeFile
        errFile = committeeFile.replace(global_options.indir, global_options.errdir)
        shutil.copyfile(committeeFile, errFile)
        return False

    #print PlaintextWriter.write(doc).getValue()

    f = open("test.out", 'w')
    f.write(PlaintextWriter.write(doc).getvalue())
    f.close()

    f = open("test.out", 'r')
    participants = find_participants(f.read())
    f.close()

    # Getting the indication whether the participant spoke in the committee
    f = open("test.out", 'r')
    docstring = f.read()
    for line in docstring.splitlines():
        name = ''
        if ":" in line:
            participant = line.split(":")[0]
            for p in participants:
                if participant in p['name']:
                    p['speaker'] = True
                    p['speak_count'] += 1

    f.close()

    fname = committeeFile.replace(global_options.indir, global_options.outdir)
    fname = fname.replace("rtf", "txt")
    file = codecs.open(fname, "w", "utf-8")

    for participant in participants:
        string_builder = []
        for key, val in participant.iteritems():
            string = u"'%s': '%s'"
            if val is not None:
                if type(val) == str:
                    val = val.replace("'", "")
                    val = val.replace('"', '')
                string = string % (key, print_unicode(val))
                string_builder.append(string)
        wrt_ln = ', '.join(string_builder)
        wrt_ln += ',\n'
        try:
            file.write(wrt_ln)

        except UnicodeEncodeError:
            print wrt_ln

    file.close()
    verbose("Generated participants file: " + fname)
    return True
Example #16
 def test_inline_png(self):
     sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf")
     with open(sample_with_image, 'rb') as rtf:
         doc = Rtf15Reader.read(rtf)
         image = next(node.content[0] for node in doc.content if isinstance(node.content[0], pyth.document.Image))
         expected = {'pngblip': True, 'picw': '20714', 'picwgoal': '750', 'pich': '12143',
                     'pichgoal': '750', 'picscaley': '100', 'picscalex': '100'}
         self.assertEquals(expected, image.properties)
Example #17
 def test_inline_png(self):
     sample_with_image = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "sample-with-image.rtf")
     with open(sample_with_image, 'rb') as rtf:
         source = Rtf15Reader.read(rtf)
         doc = XHTMLWriter.write(source).getvalue()
         self.assertIn('<img src="data:image/png;base64,', doc)
         self.assertIn('width:50px', doc)
         self.assertIn('height:50px', doc)
Example #18
def main():
    if len(sys.argv) < 3:
        print("usage: %s <rtf_file_name> <txt_file_name>" % sys.argv[0])
    else:
        doc = Rtf15Reader.read(open(os.path.join(sys.argv[1])))
        txt_filename = sys.argv[2]
        with open(os.path.join(txt_filename), "w") as of:
            of.write(PlaintextWriter.write(doc).getvalue())
Example #19
def rtf(f):
    with open(f, "rb") as f:
        doc = Rtf15Reader.read(f)
    result = []
    for element in doc.content:
        for text in element.content:
            result.append(''.join(text.content))
    return '\r\n'.join(result)
Example #20
def load_stickies(path):
    stickies = []
    with open(path) as fd:
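        # each entry in the stickies database is an RTF blob; parse it and keep the plain text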
        for i,rtf in enumerate(parse_sticky_database(fd.read())):
            doc = Rtf15Reader.read(StringIO.StringIO(rtf))
            plaintext = PlaintextWriter.write(doc).getvalue()
            stickies.append(plaintext)
    return stickies
Example #21
	def parse(self, path):
		# Directory
		if os.path.isdir(path):
			raise NotImplementedError()
		# File
		else:
			doc = Rtf15Reader.read(open(path))
			sample = Sample(path, None, PlaintextWriter.write(doc).getvalue())
			return sample
Example #22
def get_rtf_text(path):
	"""
	Take the path of an rtf file as an argument and return the text
	"""
	
		
	doc = Rtf15Reader.read(open(path))

	return PlaintextWriter.write(doc).getvalue()
Example #23
 def readRtf(self, path):
     try:
         doc = Rtf15Reader.read(open(path, "rb"))
     except:
         self._log("Some screwy rtf shit going on with " + path)
         return "Can't process ur shitty rtf <3 dfbot"
     contents = PlaintextWriter.write(doc).getvalue()
     #print contents
     return contents
Example #24
 def readRtf(self, path):
     try:
         doc = Rtf15Reader.read(open(path, "rb"))
     except:
         self._log("Some screwy rtf shit going on with " + path)
         return "Can't process ur shitty rtf <3 dfbot"
     contents = PlaintextWriter.write(doc).getvalue()
     #print contents
     return contents
Example #25
    def test_read2(self):
        rtf = StringIO("""{\\rtf1\\ansi\\ansicpg1252\\cocoartf1343\\cocoasubrtf160\\cocoascreenfonts1{\\fonttbl\\f0\\fnil\\fcharset222 Thonburi;}
{\\colortbl;\\red255\\green255\\blue255;}
\\pard\\tx560\\tx1120\\tx1680\\tx2240\\tx2800\\tx3360\\tx3920\\tx4480\\tx5040\\tx5600\\tx6160\\tx6720\\pardirnatural\\qc

{\\f0\\fs24 \\cf0 \\'b9\\'e9\\'d3\\'b5\\'a1}""")
        doc = Rtf15Reader.read(rtf)
        text = PlaintextWriter.write(doc).read()
        print text
        self.assertEquals(u"น้ำตก", text.decode('utf8'))
Example #26
def rtf_to_plain_text(file_name):
    print file_name
    out_file_name = './PlainText/%s.txt' % (file_name[:-4])
    fw = open(out_file_name, 'w')

    doc = Rtf15Reader.read(open(file_name, "r"))

    res = PlaintextWriter.write(doc).getvalue()
    fw.write(res)
    fw.close()
Example #27
 def get_text(self):
     """
     return a unicode object from the rtf file
     """
     loc = self.get_file_loc()
     if loc:
         doc = Rtf15Reader.read(open(loc, "rb"))
         txt = PlaintextWriter.write(doc).getvalue()
         return txt.decode('utf-8')
     else:
         return u""
Example #28
def clean_rtf(fname):
    doc = Rtf15Reader.read(open(fname))
    plain = PlaintextWriter.write(doc).getvalue()
    lines = plain.split("\n")
    # print '#############################\norig: %s' % pprint.pformat(lines[:10])
    lines = filter(lambda l: len(l) > 0, lines)
    # print "##############################\nno blank lines:\t%s" % pprint.pformat(lines[:10])
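    # split each line on ";" and drop the first and last character of every field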
    lines = [line.split(";") for line in lines]
    lines = [[val[1:-1] for val in line] for line in lines]
    # print "##############################\nsplit lines:\t%s" % pprint.pformat(lines[:10])
    return lines
Example #29
    def transform(self, data, options=None):
        if self._validate(data) is None:
            return None
        
        file = cStringIO.StringIO()
        file.write(''.join(self.filter(data)))
        file.seek(0)
        doc = Rtf15Reader.read(file, errors='replace')
        xhtml = XHTMLWriter.write(doc)
        xhtml_ = xhtml.read()
        xhtml.close()

        return TransformResult(StringIter(xhtml_))
Example #30
def _rtf_to_txt(file_path, dst_dir, file_name):
    """
    Uses the pyth python module to extract text from a rtf file and save
    to .txt in dst_dir.
    """
    if file_name is None:
        file_name = os.path.split(file_path)[1]
    file_dst = os.path.join(dst_dir, re.sub(r'\.rtf$', '.txt', file_name))
    doc = Rtf15Reader.read(open(file_path))
    txt = PlaintextWriter.write(doc).getvalue()
    txt = unidecode(txt)
    with open(file_dst, 'w') as f:
        f.write(txt)
    return 0
Example #31
def _rtf_to_txt(file_path, dst_dir, file_name):
    """
    Uses the pyth python module to extract text from a rtf file and save
    to .txt in dst_dir.
    """
    if file_name is None:
        file_name = os.path.split(file_path)[1]
    file_dst = os.path.join(dst_dir, re.sub(r'\.rtf$', '.txt', file_name))
    doc = Rtf15Reader.read(open(file_path))
    txt = PlaintextWriter.write(doc).getvalue()
    txt = unidecode(txt)
    with open(file_dst, 'w') as f:
        f.write(txt)
    return 0
Example #32
 def test_when_last_item_sublist_item(self):
     """ With structures like this, both lists were getting dropped
     Start
      * 1
        * 1.1
     """
     list_bug = os.path.join(os.path.abspath(os.path.dirname(__file__)), "rtfs", "list-bug.rtf")
     with open(list_bug, 'rb') as rtf:
         doc = Rtf15Reader.read(rtf)
         text = []
         traverse_text(doc, lambda x: text.append(x))
         self.assertIn('Start', text)
         self.assertIn('1', text)
         self.assertIn('1.1', text)
Example #33
def getFileText(file_path, html=False, pdf_utf8=False):
    '''
    input: string of file path
    output: either raw string or parsed html text content
    '''
    file_extension = os.path.splitext(file_path)[1]
    if file_extension.lower() != ".py":
        if file_extension.lower() == ".html" or file_extension.lower(
        ) == '.htm':
            file_content = open(file_path).read()
            if html:
                try:
                    html_text = lh.fromstring(file_content).text_content()
                    return html_text
                except UnicodeDecodeError:
                    try:
                        html_text = lh.fromstring(
                            helpers.convert_encoding(
                                file_content)).text_content()
                    except UnicodeDecodeError:
                        html_text = lh.fromstring(
                            unicode(file_content,
                                    errors='ignore')).text_content()
                        return html_text
                    return html_text
            else:
                return file_content
        if file_extension == ".pdf":
            pdf_content = open(file_path, "rb")
            pdfReader = PyPDF2.PdfFileReader(pdf_content)
            num_pages = pdfReader.getNumPages()
            page_text = ""
            for i in range(0, num_pages):
                pageObj = pdfReader.getPage(i)
                page_text = page_text + " " + pageObj.extractText()
            # Need to check for pdfs that are just scanned images
            if len(page_text) <= num_pages:
                return None
            else:
                if pdf_utf8:
                    return page_text.encode('utf-8')
                else:
                    return page_text
        if file_extension == ".rtf":
            doc = Rtf15Reader.read(open(file_path))
            page_text = PlaintextWriter.write(doc).getvalue()
            uni_page_text = page_text.decode('utf-8')
            return uni_page_text
    return None
Example #34
    def test_basic(self):
        """
        Try to read an empty rtf document
        """
        rtf = open('test.rtf', 'rb')
        rtf = rtf.read()
        # Read file content by chunks
        content = StringIO()
        content.write(rtf)

        content.seek(0)
        doc = Rtf15Reader.read(content)
        self.assert_(isinstance(doc.content[0], pyth.document.Paragraph))
        self.assert_(doc.content[2].content[0].content[0],
                     u"[` ~ ! @ # $ % ^ & * ( ) - _ = + [ { ] } \ | ; : ' 0x810x67 , < . > / ?]")
Example #35
    def _convert_rtf_to_text(self, password=None):
        input_rtf = self.cvFile
        rtf = Rtf15Reader.read(open(input_rtf))
        outputPath = self.scratchDir
        inputPath = os.getcwd()
        if os.path.exists(input_rtf):
            inputPath = os.path.dirname(input_rtf)
        input_filename = os.path.basename(input_rtf)
        input_parts = input_filename.split(".")
        input_parts.pop()
        randomStr = int(time.time())
        output_filename = outputPath + os.path.sep + ".".join(input_parts) + randomStr.__str__() + r".txt"
        self.cvTextFile = output_filename
        fw = open(self.cvTextFile, "w")
        fw.write(PlaintextWriter.write(rtf).getvalue())
        fw.close()
        return (0)
Example #36
def Run(journal_file):
    raw_entries = plistlib.readPlist(journal_file)

    acc = utils.EntryAccumulator(lambda x: x['date'])
    for k, v in raw_entries.iteritems():
        if not v: continue
        # 12/29/2001 -> 2001-12-29
        new_k = re.sub(r'(\d\d)/(\d\d)/(\d\d\d\d)', r'\3-\1-\2', k)
        d = parser.parse(new_k)

        if isinstance(v, plistlib.Data):
            f = StringIO.StringIO(v.data)
            try:
                doc = Rtf15Reader.read(f)
            except ValueError as e:
                print v.data
                raise e
            txt = PlaintextWriter.write(doc).getvalue()
            acc.add({'date': d, 'rtf': v.data, 'text': txt})
        else:
            acc.add({'date': d, 'text': v})

    for day, entries in acc.iteritems():
        assert len(entries) == 1
        entry = entries[0]

        if not entry['text']:
            continue

        summary = utils.SummarizeText(entry['text'])
        utils.WriteSingleSummary(day,
                                 maker='osxapp',
                                 summary=summary,
                                 dry_run=dry_run)
        if 'rtf' in entry:
            utils.WriteOriginal(day,
                                maker='osxapp',
                                contents=entry['rtf'],
                                filename='journal.rtf',
                                dry_run=dry_run)
        else:
            utils.WriteOriginal(day,
                                maker='osxapp',
                                contents=entry['text'].encode('utf8'),
                                filename='journal.txt',
                                dry_run=dry_run)
Example #37
def loadAllRTFToDB(folderPath):
	db = DBController()
	for dirPath, dirNames, fileNames in os.walk(folderPath):
		for fileName in fileNames:
			if not fileName.endswith('.rtf'):
				continue
			filePath = os.path.join(dirPath, fileName)
			print(filePath)
			try:
				doc = Rtf15Reader.read(open(filePath))
				text = PlaintextWriter.write(doc).getvalue()
			except:
				continue
			lines = [line.strip() for line in text.split('\n') if line]
			articleLinesDict, articleStartIndex = {}, 0
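			# lines of the form "Document <id>" separate the individual articles in the file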
			for i, line in enumerate(lines):
				if line.startswith('Document ') and len(line.split(' ')) == 2:
					articleId = line.split(' ')[-1]
					articleLinesDict[articleId] = lines[articleStartIndex : i]
					articleStartIndex = i + 1

			for articleId, lines in articleLinesDict.iteritems():
				bylineIndex, wordCountIndex, textStartIndex = -1, -1, -1
				for i, line in enumerate(lines):
					line = line.lower()
					if line.startswith('by '):
						bylineIndex = i
					elif line.endswith(' words'):
						wordCountIndex = i
					elif line == 'english':
						textStartIndex = i + 2

				if wordCountIndex == -1 or textStartIndex == -1 or wordCountIndex > textStartIndex:
					print(filePath + ', ' + articleId)
				else:
					articleDict = {'_id': articleId,
					               'filePath' : filePath.split('Marshall_RA/')[-1],
					               'headline': ' '.join(lines[: wordCountIndex]) if bylineIndex == -1 else ' '.join(lines[: bylineIndex]),
					               'byline' : '' if bylineIndex == -1 else lines[bylineIndex],
					               'date' : parser.parse(lines[wordCountIndex + 1]),
					               'sourceName' : lines[wordCountIndex + 2] if lines[wordCountIndex + 2].find(' AM') == -1 and lines[wordCountIndex + 2].find(' PM') == -1 else lines[wordCountIndex + 3],
					               'leadParagraph' : '',
					               'tailParagraph' : '\n'.join(lines[textStartIndex:]),
					               'sourceCode' : '', 'industry' : [], 'region' : [], 'newsSubject' : [], 'company' : []}
					db.saveArticle(articleDict)
Example #38
def rtf(url):
    '''
    gets the url of the rtf file, and (tries to) return an xhtml version of it.
    returns False if couldn't convert.
    '''
    remote = urlopen(url)
    data = remote.read()
    remote.close()
    temp = TemporaryFile()
    temp.write(data)
    temp.seek(0)
    try:
        doc = Rtf15Reader.read(temp)
        xhtml = XHTMLWriter.write(doc, pretty=True).read()
    except:
        xhtml = False
    temp.close()
    return xhtml
Example #39
def reviewfile(path, filename):
    document_object = Rtf15Reader.read(open(path + filename, "rb"))
    # string and list objects relating from transcript
    transcriptstring, transcriptnopunctuation = FileProcessor(document_object)
    # find legend start and end which are used to distinguish between speakers
    # section and body of transcript
    legendstart, legendend, legendendmarker = FindLegend(transcriptstring)
    # extract executive, analyst and operator strings used to idenfity speakers
    executiveslist, analystslist, operator =\
        FindSpeakers(transcriptstring, legendendmarker)
    # Next cut transcript string into segments by speaker
    commentlist, aggregatecommentlist =\
        OrganizeTranscriptBySpeaker(transcriptstring, executiveslist,
                                    analystslist, operator, filename,
                                    legendend)
    # Analyze transcript
    # orderedworddict =\
    #     AnalyzeTranscript(transcriptnopunctuation)
    personwordcounts = AnalyzeComments(aggregatecommentlist)
    personobjects = []
    for person in personwordcounts:
        isexecutive = person[0] in executiveslist
        isanalyst = person[0] in analystslist
        personobject =\
            Speaker(
                person[0], filename, isexecutive, isanalyst,
                person[1], person[2]
            )
        personobjects.append(personobject)
    # personobjects = map()
    # print "AGGREGATE COMMENTS ORGANIZED BY EACH PERSON"
    # for person, count, comment in personwordcounts:
    #     print "Speaker:"
    #     print person
    #     print ""
    #     print "Word Count:"
    #     print count
    #     print ""
    #     print "All Comments of Speaker:"
    #     print comment
    #     print ""
    #     print ""
    #     print ""
    return personobjects
Example #40
def documentToText(path):
    if path[-4:] == ".doc":
        cmd = ['antiword', path]
        p = Popen(cmd, stdout=PIPE)
        stdout, stderr = p.communicate()
        return removeNonAscii(stdout)
    elif path[-5:] == ".docx":
        return removeNonAscii(doc.process(path))
    elif path[-4:] == ".txt":
        inputFile = open(path)
        text = inputFile.read() #Because memory and such
        inputFile.close()
        return(removeNonAscii(text))
    elif path[-4:] == ".pdf":
        return removeNonAscii(convert_pdf_to_txt(path))
    elif path[-4:] == ".rtf":
        text = Rtf15Reader.read(open(path))
        return removeNonAscii(PlaintextWriter.write(text).getvalue())
    return "Returned Nothing."
Example #41
 def download_text(self):
     filename = self.link[33:] + "." + self.typ
     try:
         with requests.get(self.link, stream=True) as r:
             with open(filename, 'wb') as f:
                 shutil.copyfileobj(r.raw, f)
     except:
         print("Error downloading " + self.link)
     text = ""
     if self.typ == "pdf":
         try:
             with pdfplumber.open(filename) as pdf:
                 for page in pdf.pages:
                     text += page.extract_text()
         except:
             try:
                 text += textract.process(filename,
                                          method="tesseract",
                                          language="rus").decode("utf-8")
             except:
                 print("Error extracting " + filename)
     elif self.typ == "doc":
         try:
             text += docx2txt.process(filename)
         except:
             try:
                 output = filename[:-3] + "txt"
                 os.system("antiword {} > {}".format(filename, output))
                 with open(output) as f:
                     text += f.read()
                 os.remove(output)
             except:
                 print("Error extracting " + filename)
     elif self.typ == "rtf":
         try:
             doc = Rtf15Reader.read(open(filename, "rb"))
             text += html2text.html2text(
                 XHTMLWriter.write(doc, pretty=True).read().decode("utf-8"))
         except:
             print("Error extracting " + filename)
     if os.path.exists(filename):
         os.remove(filename)
     self.text = text
Example #42
def Run(journal_file):
  raw_entries = plistlib.readPlist(journal_file)

  acc = utils.EntryAccumulator(lambda x: x['date'])
  for k, v in raw_entries.iteritems():
    if not v: continue
    # 12/29/2001 -> 2001-12-29
    new_k = re.sub(r'(\d\d)/(\d\d)/(\d\d\d\d)', r'\3-\1-\2', k)
    d = parser.parse(new_k)

    if isinstance(v, plistlib.Data):
      f = StringIO.StringIO(v.data)
      try:
        doc = Rtf15Reader.read(f)
      except ValueError as e:
        print v.data
        raise e
      txt = PlaintextWriter.write(doc).getvalue()
      acc.add({
        'date': d,
        'rtf': v.data,
        'text': txt
      })
    else:
      acc.add({
        'date': d,
        'text': v
      })

  for day, entries in acc.iteritems():
    assert len(entries) == 1
    entry = entries[0]

    if not entry['text']:
      continue

    summary = utils.SummarizeText(entry['text'])
    utils.WriteSingleSummary(day, maker='osxapp', summary=summary, dry_run=dry_run)
    if 'rtf' in entry:
      utils.WriteOriginal(day, maker='osxapp', contents=entry['rtf'], filename='journal.rtf', dry_run=dry_run)
    else:
      utils.WriteOriginal(day, maker='osxapp', contents=entry['text'].encode('utf8'), filename='journal.txt', dry_run=dry_run)
Example #43
def rtf(url):
    '''
    gets the url of the rtf file, and (tries to) return an xhtml version of it.
    returns False if couldn't convert.
    '''
    remote = urlopen(url)
    data = remote.read()
    remote.close()
    temp = TemporaryFile()
    temp.write(data)
    temp.seek(0)
    try:
        doc = Rtf15Reader.read(temp, errors='ignore')
        xhtml = XHTMLWriter.write(doc, pretty=True).read()
    except:
        xhtml = False

        logger.exception('Failed reading rtf from {0}'.format(url))

    temp.close()
    return xhtml
Example #44
def rtf(url):
    '''
    gets the url of the rtf file, and (tries to) return an xhtml version of it.
    returns False if couldn't convert.
    '''
    remote = urlopen(url)
    data = remote.read()
    remote.close()
    temp = TemporaryFile()
    temp.write(data)
    temp.seek(0)
    try:
        doc = Rtf15Reader.read(temp)
        xhtml = XHTMLWriter.write(doc, pretty=True).read()
    except:
        xhtml = False
        exceptionType, exceptionValue, exceptionTraceback = sys.exc_info()
        logger.warn(''.join(traceback.format_exception(exceptionType, exceptionValue, exceptionTraceback)))
        
    temp.close()
    return xhtml
Example #45
    def parse(self, downloaded_file):
        logging.info("Parsing AAMS Block list")
        from pyth.plugins.rtf15.reader import Rtf15Reader, Group

        def _handle_ansi_escape(self, code):
            try:
                Group._handle_ansi_escape(self, code)
            except:
                self.content.append(" ")
        Group.handle_ansi_escape = _handle_ansi_escape
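        # patching Group this way turns unrecognised ANSI escapes into spaces instead of letting them raise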

        doc = Rtf15Reader.read(downloaded_file)
        doc.content[0].content
        siti = doc.content[0].content[3].content[0]
        for sito in siti.split("\n"):
            m = re.search("(\d+)(.*)", sito)
            if m:
                url = m.group(2)
                yield {
                    "url": url,
                }
Example #46
def handle_files():
    '''
    The main function to start processing the rtf files
    into csv
    '''
    file_prefix = "old_committee-meetings-protocols"
    for file_name in glob2.glob(protocol_dir):
        if file_prefix in file_name:
            doc = Rtf15Reader.read(open(file_name))
            data = PlaintextWriter.write(doc).getvalue()
            data = data.split(':')
            for leg in data:
                if "מוזמנים" in leg:
                    index_visitor = data.index(leg) + 1
                    name = ({'header': 'מוזמנים', 'body': data[index_visitor]})
                    dir_file = file_name.replace('.rtf', '.csv')
                    with open(dir_file, 'w') as f:
                        w = csv.DictWriter(f, name.keys())
                        w.writeheader()
                        w.writerow(name)
                        break
Example #47
File: rtf2rst.py Project: cdjc/theoreST
 def _readin_rtf(self):
     doc = Rtf15Reader.read(open(self.fname, 'r'))
 
     self.doc_text = []
     self.doc_props = []
     
     for i,element in enumerate(doc.content):
         if hasattr(element,"content"):
             if len(element.content) == 0:
                 self.doc_text.append('') # paragraph
                 self.doc_props.append([])
             for text in element.content:
                 if not isinstance(text, pyth.document.Text):
                     if isinstance(text, pyth.document.ListEntry):
                         continue
                     print("### Unknown paragraph element", text, file=sys.stderr)
                     sys.exit(1)
                 #print text.content[0]
                 self.doc_text.append(text.content[0])
                 self.doc_props.append(list(text.properties.keys()))
     self._formatting_fix_ups()
Example #48
    def parse(self, downloaded_file):
        logging.info("Parsing AAMS Block list")
        from pyth.plugins.rtf15.reader import Rtf15Reader, Group

        def _handle_ansi_escape(self, code):
            try:
                Group._handle_ansi_escape(self, code)
            except:
                self.content.append(" ")

        Group.handle_ansi_escape = _handle_ansi_escape

        doc = Rtf15Reader.read(downloaded_file)
        doc.content[0].content
        siti = doc.content[0].content[3].content[0]
        for sito in siti.split("\n"):
            m = re.search("(\d+)(.*)", sito)
            if m:
                url = m.group(2)
                yield {
                    "url": url,
                }
Example #49
    def GET(self, day):
        out = StringIO()
        out.write("""<html>
<head>
<link rel="stylesheet" href="/static/viewer.css" />
<script src="/static/jquery-1.7.min.js"></script>
</head>
<body>
<div id="oneday">
""")
        data = utils.GetOneDay(datetime.strptime(day, '%Y/%m/%d').date())
        for maker in sorted(data.keys()):
            out.write('<h2>%s</h2>\n' % maker)
            # TODO(danvk): include URL, thumbnail if available.
            out.write('<p>%s</p>\n' %
                      data[maker]['summary']['summary'].encode('utf8'))

            if 'originals' in data[maker]:
                originals = data[maker]['originals']
                for filename in sorted(originals.keys()):
                    out.write('<h3>%s</h3>\n' % filename)
                    _, ext = os.path.splitext(filename)
                    if ext == '.txt':
                        out.write('<pre>%s</pre>\n' % originals[filename])
                    elif ext == '.html':
                        out.write(originals[filename])
                    elif ext == '.rtf':
                        f = StringIO(originals[filename])
                        doc = Rtf15Reader.read(f)
                        html = XHTMLWriter.write(doc).getvalue()
                        out.write(html)
                    else:
                        out.write('<p>(Unknown format "%s")</p>' % ext)

            out.write('<hr/>\n')

        out.write('</div></body></html>')
        return out.getvalue()
Example #50
def get_one_month_from_rtf(url):
    rtf_file = urllib2.urlopen(url)
    rtf_file = StringIO(rtf_file.read())
    doc = Rtf15Reader.read(rtf_file)

    final_data = []

    header = False
    for c in doc.content:
        full_p = c.content.__repr__().lower()
        if "capacity" in full_p and "use cna" in full_p:

            header = True
            continue

        if header:
            row = re.split(r"\t", c.content[0].content[0])
            if len(row) == 7:
                final_data.append(row)

    df = pd.DataFrame(final_data,
                      columns=[
                          "prison_name", "baseline_cna", "in_use_cna",
                          "operational_capacity", "population",
                          "perc_pop_to_used_cna", "perc_acc_available"
                      ])

    df.iloc[:, 1:] = df.iloc[:, 1:].replace("%", "",
                                            regex=True).replace(",",
                                                                "",
                                                                regex=True)

    for c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='ignore')

    cols = [c for c in df.columns if "perc" in c]
    df.loc[:, cols] = df.loc[:, cols] / 100
    return df
Example #51
    def testmethod(self):  # the test method to be added
        inputfilename = os.path.join(rtfinputsdir, basename+".rtf")
        outputfilename = os.path.join(testoutputdir, 
                                      "%s.%s" % (basename, writer))
        #--- obtain reference output or skip test:
        with open(referencefilename, "rb") as input:
            the_referenceoutput = input.read()
        #--- read and convert RTF:
        with open(inputfilename, "rb") as input:
            document = Rtf15Reader.read(input)
        if writer == 'html':
            the_testoutput = XHTMLWriter.write(document, pretty=True).read()
            write_html_file(outputfilename, the_testoutput, print_msg=False)
        elif writer == 'txt':
            with open(outputfilename, "wt") as f:
                PlaintextWriter.write(document, f)

        #--- compute test output:
        with open(outputfilename, "rb") as input:
            the_testoutput = input.read()
        #--- check outcome:
        if the_testoutput == the_referenceoutput:
            os.remove(outputfilename)  # assert will succeed, so it is no longer needed
        self.assertEqual(the_testoutput, the_referenceoutput)
Example #52
def convertRtfToText(path):
    doc = Rtf15Reader.read(open(path))
    return PlaintextWriter.write(doc).getvalue()
Example #53
File: tasks.py Project: sahana/eden-stable
    def document_create_index(document, user_id=None):

        import os
        from xlrd import open_workbook
        from pyth.plugins.rtf15.reader import Rtf15Reader
        from pyth.plugins.plaintext.writer import PlaintextWriter
        import sunburnt

        document = json.loads(document)
        table = s3db.doc_document
        id = document["id"]

        name = document["name"]
        filename = document["filename"]

        filename = "%s/%s/uploads/%s" % (os.path.abspath("applications"), \
                                        request.application, filename)

        si = sunburnt.SolrInterface(settings.get_base_solr_url())

        extension = os.path.splitext(filename)[1][1:]

        if extension == "pdf":
            data = os.popen("pdf2txt.py " + filename).read()
        elif extension == "doc":
            data = os.popen("antiword " + filename).read()
        elif extension == "xls":
            wb = open_workbook(filename)
            data = " "
            for s in wb.sheets():
                for row in range(s.nrows):
                    values = []
                    for col in range(s.ncols):
                        values.append(str(s.cell(row, col).value))
                    data = data + ",".join(values) + "\n"
        elif extension == "rtf":
            doct = Rtf15Reader.read(open(filename))
            data = PlaintextWriter.write(doct).getvalue()
        else:
            data = os.popen("strings " + filename).read()

        # The text needs to be in unicode or ascii, with no contol characters
        data = str(unicode(data, errors="ignore"))
        data = "".join(c if ord(c) >= 32 else " " for c in data)

        # Put the data according to the Multiple Fields
        # @ToDo: Also, would change this according to requirement of Eden
        document = {
            "id": str(id),  # doc_document.id
            "name": data,  # the data of the file
            "url": filename,  # the encoded file name stored in uploads/
            "filename": name,  # the filename actually uploaded by the user
            "filetype": extension  # x.pdf -> pdf is the extension of the file
        }

        # Add and commit Indices
        si.add(document)
        si.commit()
        # After Indexing, set the value for has_been_indexed to True in the database
        db(table.id == id).update(has_been_indexed=True)

        db.commit()
Example #54
 def test(self):
     # Just make sure they don't crash, for now
     Rtf15Reader.read(open(path, "rb"))
Example #55
def rtf(from_file, to_txt, opts):
    doc = Rtf15Reader.read(open(from_file.path, "rb"))
    text = PlaintextWriter.write(doc).getvalue()
    return save_raw_data(to_txt.path, text)
Example #56
def main():
    y = 0
    contains = False
    for a in range(1, 7):
        list = []
        dev = '{}{}'.format("dev", a)
        root = api.path + '/{}/interviews/'.format(dev)
        dirlist = [
            item for item in os.listdir(root)
            if os.path.isfile(os.path.join(root, item))
        ]

        for i in dirlist:
            if i.endswith('.rtf'):
                x = 0
                print root + i
                doc = PlaintextWriter.write(Rtf15Reader.read(
                    open(root + i))).getvalue()
                first_index = doc.find("Clip Transcript") + 21
                second_index = doc.find("Clip Keywords") - 19
                interview_date = doc[first_index - 35:first_index -
                                     25].replace("_", "-")
                clip_transcript = doc[first_index:second_index]
                list.append([interview_date, clip_transcript])

                while x < doc.count("Clip Transcript"):

                    if doc.find("Clip Transcript", second_index) > 0:
                        first_index = doc.find("Clip Transcript",
                                               second_index) + 21
                    else:
                        break
                    if doc.find("Clip Keywords", first_index) > 0:
                        second_index = doc.find("Clip Keywords",
                                                first_index) - 16
                    else:
                        break

                    interview_date = doc[first_index - 35:first_index -
                                         25].replace("_", "-")
                    clip_transcript = doc[first_index:second_index]

                    print interview_date, clip_transcript

                    for sublist in list:
                        if sublist[0] == interview_date:
                            sublist[1] = sublist[
                                1] + "\n------------------\n" + clip_transcript
                            contains = True
                    if not contains:
                        list.append([interview_date, clip_transcript])
                    contains = False

                    x = x + 1

        print list
        # print list[0][0], list[0][1]

        for i in dirlist:
            if i.endswith('.mp3'):

                tag = i[:10].replace("_", "-")

                index = 0
                for sublist in list:
                    if tag.strip() == sublist[0].strip():
                        description = unicode(list[index][1], errors="ignore")
                        contains = True
                    index = index + 1
                if not contains:
                    description = ""
                contains = False

                data_audio = {
                    "description":
                    description,
                    "duration":
                    (MP3(api.path +
                         '/{}/interviews/{}'.format(dev, i))).info.length,
                    "id":
                    "",
                    "interview":
                    api.get('interviews', y + 1),
                    "status":
                    "PRIVATE",
                    "tag":
                    i,
                    "uri":
                    "http://opendata.soccerlab.polymtl.ca/audios/" + i,
                    "author":
                    "",
                    "license":
                    ""
                }
                api.request("Audio", 'audios', data_audio)
                y = y + 1
Example #57
def parse():
    committeeIds = {}
    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue

            year = csvFileName[7:11]
            committeeIds[year] = set()

            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                for row in csv.reader(csvFile):
                    committeeId = row[8]
                    if row[6] == '2' and committeeId != '0':
                        committeeIds[year].add(committeeId)
    dictionary = {}
    with open('./log.txt', 'w+') as outputFile:
        for rtfDir, _, rtfFiles in os.walk('./rtf'):
            for fileName in rtfFiles:
                if not fileName.endswith('.rtf'):
                    continue

                year = fileName[:4]
                if year not in dictionary:
                    dictionary[year] = {}

                with open(os.path.join(rtfDir, fileName)) as rtfFile:
                    parsedName = re.findall(u'\d+', fileName)
                    date, meetingId = str(
                        '/'.join(parsedName[0:3][::-1])
                    ), parsedName[-1] if len(parsedName) > 3 else '00'
                    try:
                        doc = Rtf15Reader.read(rtfFile)
                    except Exception:
                        continue

                    for line in PlaintextWriter.write(doc):
                        line = unicode(line, encoding='utf-8')
                        if len(line) < 95 and re.match(approvedLine, line):
                            res = list(
                                set(re.findall(u'\d+', line))
                                & committeeIds[year])
                            if len(res) > 0:
                                outputFile.write(fileName + ': ' +
                                                 line.encode('utf-8'))
                                for requestId in res:
                                    dictionary[year][requestId] = [
                                        date, meetingId
                                    ]

    for csvDir, _, csvFiles in os.walk('./csv'):
        for csvFileName in csvFiles:
            if not re.match('\w+\d{4}\.csv$', csvFileName):
                continue

            year = csvFileName[7:11]
            if not year in dictionary:
                continue

            with open(os.path.join(csvDir, csvFileName)) as csvFile:
                with open(
                        os.path.join(
                            csvDir,
                            csvFileName[:-4] + '_out' + csvFileName[-4:]),
                        'w+') as outputCsv:
                    writer = csv.writer(outputCsv)
                    reader = csv.reader(csvFile)
                    writer.writerow(reader.next() + headerColumns)
                    for row in csv.reader(csvFile):
                        committeeId = '' if len(row) < 9 else row[8]
                        writer.writerow(row + (
                            ['', ''] if committeeId not in dictionary[year]
                            else dictionary[year][committeeId]))
Example #58
from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter

import sys

if len(sys.argv) > 1:
    filename = sys.argv[1]
else:
    filename = "sample.rtf"

doc = Rtf15Reader.read(open(filename, "rb"))

print XHTMLWriter.write(doc, pretty=True).read()