Example #1
File: head.py Project: ywangd/stash
def main(args):
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("-n", "--lines",default=10, type=int,
                   help="""print the first K lines instead of 10;
                   if negative, print the last -K lines""")
    p.add_argument("-q", "--quiet", "--silent", action='store_true',
                   help="never print headers for each file")
    p.add_argument("-v", "--verbose", action='store_true',
                   help="always print headers for each file")
    p.add_argument("files", action="store", nargs="*",
                   help="files to print")
    ns = p.parse_args(args)

    status = 0

    header_fmt = '==> {} <==\n'

    if len(ns.files) == 0:
        ns.files = ['-']

    try:
        for fname in ns.files:
            if ns.verbose or (len(ns.files) > 1 and not ns.quiet):
                if fname == '-':
                    print(header_fmt.format('standard input'), end='')
                else:
                    print(header_fmt.format(fname), end='')

            fileinput.close()
            inp = fileinput.input(fname, openhook=fileinput.hook_encoded("utf-8"))
            if ns.lines >= 0:
                buf = []
                for i, line in enumerate(inp):
                    if i >= ns.lines:
                        break
                    buf.append(line)
                for line in buf:
                    print(line, end='')
            else:
                buf = []
        for line in inp:
                    buf.append(line)
                    if len(buf) > -ns.lines:
                        del buf[0]
                for line in buf:
                    print(line, end='')

    except Exception as e:
        print('head: %s' % str(e))
        status = 1
    finally:
        fileinput.close()

    sys.exit(status)
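The negative-K branch above maintains the bounded tail buffer by hand. A minimal sketch of the same idea with collections.deque, whose maxlen discards the oldest line automatically (tail_lines is a hypothetical helper, not part of head.py):

import collections
import fileinput

def tail_lines(files, k):
    # keep only the last k lines; deque evicts from the left once maxlen is reached
    buf = collections.deque(maxlen=k)
    try:
        for line in fileinput.input(files, openhook=fileinput.hook_encoded("utf-8")):
            buf.append(line)
    finally:
        fileinput.close()
    return list(buf)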
Example #2
def main():
    parser = arg_parser()
    args = parser.parse_args()

    dir = args.onionshare_dir

    src = files_in(dir, 'onionshare') + files_in(dir, 'onionshare_gui')
    pysrc = [p for p in src if p.endswith('.py')]
    htmlsrc = [p for p in src if p.endswith('.html')]

    translate_keys = set()
    # load translate key from python source
    for line in fileinput.input(pysrc, openhook=fileinput.hook_encoded('utf-8')):
        # search `strings._('translate_key')`
        #        `strings._('translate_key', True)`
        m = re.search(r'strings\._\((.*?)\)', line)
        if m:
            arg = m.group(1)
            key = arg.split(',')[0].strip('''"' ''')
            translate_keys.add(key)

    # load translate key from html source
    for line in fileinput.input(htmlsrc, openhook=fileinput.hook_encoded('utf-8')):
        # search `{{strings.translate_key}}`
        m = re.search(r'{{.*strings\.([-a-zA-Z0-9_]+).*}}', line)
        if m:
            key = m.group(1)
            translate_keys.add(key)


    if args.show_all_keys:
        for k in sorted(translate_keys):
            print(k)
        sys.exit()


    locale_files = [f for f in files_in(dir, 'locale') if f.endswith('.json')]
    for locale_file in locale_files:
        with codecs.open(locale_file, 'r', encoding='utf-8') as f:
            trans = json.load(f)
        # trans -> {"key1": "translate-text1", "key2": "translate-text2", ...}
        locale_keys = set(trans.keys())

        disused = locale_keys - translate_keys
        lacked = translate_keys - locale_keys

        locale, ext = os.path.splitext(os.path.basename(locale_file))
        for k in sorted(disused):
            print(locale, 'disused', k)

        for k in sorted(lacked):
            print(locale, 'lacked', k)
Example #3
    def test_file_opening_hook(self):
        try:
            # cannot use openhook and inplace mode
            fi = FileInput(inplace=1, openhook=lambda f, m: None)
            self.fail("FileInput should raise if both inplace "
                      "and openhook arguments are given")
        except ValueError:
            pass
        try:
            fi = FileInput(openhook=1)
            self.fail("FileInput should check openhook for being callable")
        except ValueError:
            pass
        if due_to_ironpython_incompatibility("functionality in cpython site.py"):
            # without it, lookup('rot13') will fail due to lack of search functions
            # which were registered in encodings\__init__.py
            import encodings
        if not due_to_ironpython_bug('http://tkbgitvstfat01:8080/WorkItemTracking/WorkItem.aspx?artifactMoniker=148925'):
            try:
                t1 = writeTmp(1, ["A\nB"], mode="wb")
                fi = FileInput(files=t1, openhook=hook_encoded("rot13"))
                lines = list(fi)
                self.assertEqual(lines, ["N\n", "O"])
            finally:
                remove_tempfiles(t1)
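The first assertion above depends on FileInput rejecting inplace together with openhook at construction time. A standalone sketch of that behavior (the filename is a placeholder; it is never opened, because the error is raised first):

import fileinput

try:
    fileinput.FileInput("example.txt", inplace=True,
                        openhook=fileinput.hook_encoded("utf-8"))
except ValueError as exc:
    print("rejected:", exc)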
Example #4
File: sort.py Project: ywangd/stash
def main(args):
    ap = argparse.ArgumentParser()
    ap.add_argument('files', nargs='*', help='files to sort')
    ap.add_argument('-r', '--reverse', action='store_true', default=False,
                    help='reverse the result of comparisons')
    ns = ap.parse_args(args)

    def _print(lines):
        if lines is not None:
            lines = sorted(lines)
            if ns.reverse:
                lines = lines[::-1]
            print(''.join(lines))

    fileinput.close()  # in case it is not closed
    try:
        lines = None
        for line in fileinput.input(ns.files, openhook=fileinput.hook_encoded("utf-8")):
            if fileinput.isfirstline():
                _print(lines)
                lines = []
            lines.append(line)

        _print(lines)

    finally:
        fileinput.close()
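sort.py sorts each file separately: fileinput.isfirstline() is true on the first line of every new file, which is the cue to flush the previous file's buffer. A minimal sketch of that grouping pattern in isolation (group_by_file is a hypothetical name):

import fileinput

def group_by_file(files):
    current = None
    try:
        for line in fileinput.input(files, openhook=fileinput.hook_encoded("utf-8")):
            if fileinput.isfirstline():
                if current is not None:
                    yield current
                current = []
            current.append(line)
        if current is not None:
            yield current
    finally:
        fileinput.close()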
Example #5
    def get_preprocessed_text(self, limit=None):
        """
        Generator generates preprocessed list of tokenized words on every call.

        - Read Sentence tokenized intermediate preprocessed file.
        - Tokenize and preprocess words, return list of words from a sentence.
        """
        count = 0
        if limit is None:
            limit = self.limit
        for sentence in fileinput.input(
            files=[self.preprocessed_corpus_path],
            openhook=fileinput.hook_encoded(self.encoding)
        ):
            word_list = itertools.chain(*(
                self._clean_word(
                    word
                ) for word in self._tokenize_words(sentence)
            ))
            word_list = [
                word for word in word_list if len(word) != 0
            ]
            count += len(word_list)
            if limit is not None and count >= limit:
                fileinput.close()
                return  # PEP 479: raising StopIteration inside a generator is a RuntimeError on Python 3.7+
            else:
                yield word_list
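The manual word counting above can also be expressed with itertools.islice over a flattened word stream. A rough sketch under the simplifying assumption that words are whitespace-separated (limited_words is a hypothetical helper):

import fileinput
import itertools

def limited_words(path, limit, encoding="utf-8"):
    stream = fileinput.input(files=[path], openhook=fileinput.hook_encoded(encoding))
    words = (word for line in stream for word in line.split())
    try:
        for word in itertools.islice(words, limit):
            yield word
    finally:
        fileinput.close()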
Example #6
def parseTag(inputfilename, outputfilename, searchExp):
  fin = fileinput.input(inputfilename, inplace = 0, openhook = fileinput.hook_encoded(fileencoding))
  fout = codecs.open(outputfilename, "w", fileencoding)
  isblock = 0
  for line in fin:
    newline = line
    isfirst = searchExp in line
    islast = "\tMedium;" in line
    issingleline = isfirst and islast # and "," in line
    fixquotes = 0

    if issingleline:
      fixquotes = "\t" in extractThirdField(line) # If there is a comma on the third fild, quote it!
      if fixquotes:
        newline = leftQuoteThirdField(line)
        newline = rightQuoteThirdField(newline)
        print("%d: %s" % (fileinput.filelineno(), newline))
        # print("%d:(issingle):%s" % (fileinput.filelineno(), newline))

    if (not issingleline) and (isfirst and not islast):
      #newline = reverseReplace(line, searchExp, searchExp + '"', 1)
      newline = leftQuoteThirdField(line)
      print "quoting left"
      isblock = 1
    if (not issingleline) and (not isfirst and islast and isblock):
      newline = reverseReplace(line, "\tMedium;", '"' + "\tMedium;", 1)
      print "quoting right"
      isblock = 0
    #TODO: Fix the single line comma bug
    fout.write(newline)
    if issingleline:
      print "%d: %s" % (fileinput.filelineno(), newline)
  fout.close()
Example #7
def run(target_dir, inplaceFlag=0):
    global showMessageCount, alreadyChanged, showMessageChanged
    for root, dirs, files in os.walk(target_dir):
        for file in files:
            if file.endswith(".jsp") and (file.lower() in teoconstants.jsps):
                if inplaceFlag == 0:  # improve performance
                    f = fileinput.input(
                        root + "\\" + file, inplace=inplaceFlag, openhook=fileinput.hook_encoded("utf-8")
                    )
                elif inplaceFlag == 1:
                    f = fileinput.input(root + "\\" + file, inplace=inplaceFlag)

                for i, line in enumerate(f):
                    if re.search("posui:showImageButtons", line, re.IGNORECASE):
                        showMessageCount += 1
                        if re.search("isMultiLang", line):
                            alreadyChanged += 1
                        else:
                            showMessageChanged += 1
                            line = line.replace("posui:showMessage", 'posui:showMessage isMultiLang="true"')

                            if inplaceFlag == 0:
                                sys.stdout.write(file + " : " + line)
                    if inplaceFlag == 1:
                        sys.stdout.write(line)

                f.close()
Example #8
def uniq_files(path_list):
    res = set()
    for line in fileinput.FileInput(path_list,
                                    openhook=fileinput.hook_encoded(
                                            "utf-8")):
        res.add(line.strip())
    return sorted(list(res))
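Usage sketch for uniq_files (the file names are placeholders):

for item in uniq_files(["a.txt", "b.txt"]):
    print(item)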
Example #9
def read_csv_file(source_path_param):
    list1 = []
    for line in fileinput.input([source_path_param], openhook=fileinput.hook_encoded("utf8")):
        list1.append(line.rstrip('\n'))  # rstrip avoids dropping the last character of a final line with no newline
    fileinput.close()
    return list1
Example #10
def read_channelconfig(config_file):
    channels = dict()
    # fileinput.hook_encoded("utf-8") only builds an opener for fileinput.input();
    # called on its own like this, it has no effect on the open() below
    fileinput.hook_encoded("utf-8")
    regex = re.compile(r'\[([^\]]*)\]\s*([A-Za-z0-9_]+)')
    with open(config_file, mode='r', encoding='utf-8') as file:
        while True:
            line = file.readline().lstrip().rstrip('\n')
            if line == '':  # readline() returns '' at EOF (it never returns None)
                break
            if not line.startswith("#"):
                match = regex.match(line)
                if match:
                    channelname = match.group(1)
                    channelvalue = match.group(2)
                    channel_key = channelname if channelname else channelvalue
                    channels[channel_key] = channelvalue
    return channels
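For contrast with the stray call above: hook_encoded only takes effect when its result is passed as openhook, as in this minimal sketch (assumes Python 3.2+, where FileInput is a context manager):

import fileinput

def read_config_lines(path):
    # fileinput calls the hook to open each file with the requested encoding
    with fileinput.input(files=[path], openhook=fileinput.hook_encoded("utf-8")) as f:
        return [line.rstrip('\n') for line in f]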
Example #11
File: pbcopy.py Project: ywangd/stash
def main(args):
    ap = argparse.ArgumentParser()
    ap.add_argument('file', nargs='*', help='one or more files to be copied')
    ns = ap.parse_args(args)
    
    fileinput.close() # in case it is not closed
    try:
        clipboard.set(''.join(line for line in fileinput.input(ns.file, openhook=fileinput.hook_encoded("utf-8"))))
    except Exception as err:
        print("pbcopy: {}: {!s}".format(type(err).__name__, err), file=sys.stderr)
    finally:
        fileinput.close()
Example #12
    def findStringInListOfFiles(self, source_dir, fileList, string, regex=0):
        for root, dirs, files in os.walk(source_dir):
            for file in files:
                if file.lower() in fileList:
                    f = fileinput.input(root + "\\" + file, inplace=0, openhook=fileinput.hook_encoded('utf-8'))
                    for i, line in enumerate(f):
                        if regex == 0:
                            if line.find(string) != -1:
                                sys.stdout.write(file + ':' + str(i) + line)
                        else:
                            if re.search(string, line):
                                sys.stdout.write(file + ':' + str(i) + '\t' + line)
Example #13
File: grep.py Project: ywangd/stash
def main(args):
    global _stash
    ap = argparse.ArgumentParser()
    ap.add_argument('pattern', help='the pattern to match')
    ap.add_argument('files', nargs='*', help='files to be searched')
    ap.add_argument('-i', '--ignore-case', action='store_true',
                    help='ignore case while searching')
    ap.add_argument('-v', '--invert', action='store_true',
                    help='invert the search result')
    ap.add_argument('-c', '--count', action='store_true',
                    help='count the search results instead of normal output')
    ns = ap.parse_args(args)

    flags = 0
    if ns.ignore_case:
        flags |= re.IGNORECASE

    pattern = re.compile(ns.pattern, flags=flags)

    # Do not try to grep directories
    files = [f for f in ns.files if not os.path.isdir(f)]

    fileinput.close()  # in case it is not closed
    try:
        counts = collections.defaultdict(int)
        for line in fileinput.input(files, openhook=fileinput.hook_encoded("utf-8")):
            if bool(pattern.search(line)) != ns.invert:
                if ns.count:
                    counts[fileinput.filename()] += 1
                else:
                    if ns.invert:  # optimize: if ns.invert, then no match, so no highlight color needed
                        newline = line
                    else:
                        newline = re.sub(pattern, lambda m: _stash.text_color(m.group(), 'red'), line)
                    if fileinput.isstdin():
                        fmt = u'{lineno}: {line}'
                    else:
                        fmt = u'{filename}: {lineno}: {line}'

                    print(fmt.format(filename=fileinput.filename(),
                                     lineno=fileinput.filelineno(),
                                     line=newline.rstrip()))
                
        if ns.count:
            for filename, count in counts.items():
                fmt = u'{count:6} {filename}'
                print(fmt.format(filename=filename, count=count))
                
    except Exception as err:
        print("grep: {}: {!s}".format(type(err).__name__, err), file=sys.stderr)
    finally:
        fileinput.close()
Example #14
def main():
    parser = arg_parser()
    args = parser.parse_args()

    dir = args.onionshare_dir

    src = files_in(dir, 'onionshare') + \
          files_in(dir, 'onionshare_gui') + \
          files_in(dir, 'onionshare_gui/mode') + \
          files_in(dir, 'onionshare_gui/mode/share_mode') + \
          files_in(dir, 'onionshare_gui/mode/receive_mode') + \
          files_in(dir, 'install/scripts') + \
          files_in(dir, 'tests')
    pysrc = [p for p in src if p.endswith('.py')]

    lang_code = args.lang_code

    translate_keys = set()
    # load translate key from python source
    for line in fileinput.input(pysrc, openhook=fileinput.hook_encoded('utf-8')):
        # search `strings._('translate_key')`
        #        `strings._('translate_key', True)`
        m = re.findall(r'strings\._\((.*?)\)', line)
        if m:
            for match in m:
                key = match.split(',')[0].strip('''"' ''')
                translate_keys.add(key)

    if args.show_all_keys:
        for k in sorted(translate_keys):
            print(k)
        sys.exit()

    if lang_code == 'all':
        locale_files = [f for f in files_in(dir, 'share/locale') if f.endswith('.json')]
    else:
        locale_files = [f for f in files_in(dir, 'share/locale') if f.endswith('%s.json' % lang_code)]
    for locale_file in locale_files:
        with codecs.open(locale_file, 'r', encoding='utf-8') as f:
            trans = json.load(f)
        # trans -> {"key1": "translate-text1", "key2": "translate-text2", ...}
        locale_keys = set(trans.keys())

        disused = locale_keys - translate_keys
        lacked = translate_keys - locale_keys

        locale, ext = os.path.splitext(os.path.basename(locale_file))
        for k in sorted(disused):
            print(locale, 'disused', k)

        for k in sorted(lacked):
            print(locale, 'lacked', k)
Example #15
File: mass.py Project: sornars/mass
def func(input_dir, file_reg, func, args=None, *, encoding='utf-8'):
    """Run func for every line in the files matching file_reg in input_dir."""
    with fileinput.input(glob.glob(input_dir + file_reg),
                         openhook=fileinput.hook_encoded(encoding)) as f:
        for line in f:
            try:
                if args:
                    func(line, *args)
                else:
                    func(line)
            except Exception:
                print(line)
                raise
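A usage sketch for func (directory and pattern are placeholders; print stands in for any per-line callable):

# apply print to every line of ./logs/*.log, decoded as UTF-8
func('./logs/', '*.log', print)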
Example #16
File: reader.py Project: khris/spamipsum
def _get_word_from_file(path):
    first = True
    for root, dirs, files in os.walk(path):
        with FileInput(files=_get_full_paths(root, files),
                       openhook=fileinput.hook_encoded(DEFAULT_ENCODING)) as f:
            for line in f:
                if f.isfirstline():
                    if first:
                        first = False
                    else:
                        yield EOD()
                for word in line.split():
                    yield word
    yield EOD()
Example #17
def Convert(description=u'', output=u'sys.stdout', input=u'sys.stdin'):
    """The main loop routine in charge to read the data and to report results.

    Args:
        description: todo.
        output: the output channel to be used for the results.
        input: the input channel to be used to feed l2tcsv data.
    """
    cybox_files = {}
    cybox_files_related = {}
    rows = []

    openhook = fileinput.hook_encoded(u'utf8')
    file_in = fileinput.FileInput(input, openhook=openhook)

    try:
        reader = csv.DictReader(file_in, fieldnames=L2TCSV_HEADER)
        # Check if input file or stdin has l2tcsv headers.
        first_row = reader.next()
        if first_row[u'date'] != u'date' and first_row[u'extra'] != u'extra':
            EventToCybox(first_row, cybox_files, cybox_files_related)
        # Process lines, one-step over data without memory.
        for row in reader:
            EventToCybox(row, cybox_files, cybox_files_related)
    except IOError as exception_io:
        logging.error(u'IO error: {0:s}'.format(exception_io))
        return

    observables = cyboxObservables()

    # Actually hard coded.
    tool = cyboxTools.ToolInformation(u'Plaso')
    tool.version = u'1.4.1'
    tool_list = cyboxTools.ToolInformationList()
    tool_list.append(tool)
    observables.observable_package_source = cyboxMeasureSource.MeasureSource()
    observables.observable_package_source.tools = tool_list

    for key, cybox_file in cybox_files.iteritems():
        observables.add(cyboxObservable(cybox_file))

    try:
        if output != u'sys.stdout':
            file_out = open(output, u'w')
        else:
            file_out = sys.stdout
        file_out.write(observables.to_xml().encode(u'utf8'))
    except IOError as exception_io:
        logging.error(u'IO error: {0:s}'.format(exception_io))
Example #18
def process_data(source_path_param, output, choice):
    for line in fileinput.input([source_path_param], openhook=fileinput.hook_encoded("utf8")):
        value = line.split(',')
        # print(value)
        x = int(value[12].strip())  # hour
        y = 0.0
        if choice == 'retweet':
            y = float(value[3].strip())  # diff_retweet
        elif choice == 'follower_wt_mc':
            y = float(value[10].strip())  # diff_follower_wt_mc
        elif choice == 'follower_wo_mc':
            y = float(value[13].strip())  # diff_follower_wo_mc
        output[x].append(y)
    fileinput.close()
    return
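Usage sketch for process_data; output must map hour indexes to lists, so a defaultdict(list) fits the output[x].append(y) pattern above (the CSV path is a placeholder):

from collections import defaultdict

output = defaultdict(list)
process_data("tweets.csv", output, "retweet")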
Example #19
File: mass.py Project: sornars/mass
def concat(input_dir, file_reg, output, *, encoding='utf-8', fltr=None):
    """Concatenate all files matching file_reg in input_dir into output."""
    with fileinput.input(glob.glob(input_dir + file_reg),
                         openhook=fileinput.hook_encoded(encoding)) as f, \
            open(output, 'w', encoding=encoding) as o:
        for line in f:
            try:
                if fltr:
                    if re.search(fltr, line):
                        o.write(line)
                else:
                    o.write(line)
            except Exception:
                print(line)
                raise
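Usage sketch for concat (paths and filter are placeholders; note that encoding and fltr are keyword-only):

# merge ./parts/*.csv into combined.csv, keeping only lines that contain "2023"
concat('./parts/', '*.csv', 'combined.csv', fltr=r'2023')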
Example #20
def generate_game(difficulty, size=6, num=1):
    words = []
    for line in fileinput.input(files=dicts[difficulty], openhook=fileinput.hook_encoded("iso-8859-1")):
        word = line.strip().upper()
        if len(word) > 2:
            words.append(word)
    while True:
        scramble = generate_scramble(size)
        letter_count = make_letter_count(scramble)
        soln = []
        for word in words:
            if can_make_word(letter_count, word):
                soln.append(word)
        if len(soln) > 9:
            return (scramble, soln)
Example #21
def doReadBigramLattice(filename, bigramLat):
    starttime = time.time()
    ft1 = bigram.BIGRAM_START
    filegen =  fileinput.input(filename, openhook = fileinput.hook_encoded('utf8'))
    for ln in filegen:
        ft2 = readFeatureTag(ln)
        bg = bigram(ft1, ft2)
        bigramLat.addItem(bg)
        ft1 = ft2
    ft2 = bigram.BIGRAM_END
    bg = bigram(ft1, ft2)
    bigramLat.addItem(bg)
    endtime = time.time()
    elapsetime = endtime - starttime
    print("Read {0} file in {1} seconds. {2} items".format(filename, elapsetime, bigramLat.getN()))
    return elapsetime
Example #22
    def test_readline(self):
        with open(TESTFN, 'wb') as f:
            f.write(b'A\nB\r\nC\r')
            # Fill TextIOWrapper buffer.
            f.write(b'123456789\n' * 1000)
            # Issue #20501: readline() shouldn't read whole file.
            f.write(b'\x80')
        self.addCleanup(safe_unlink, TESTFN)

        with FileInput(files=TESTFN,
                       openhook=hook_encoded('ascii'), bufsize=8) as fi:
            self.assertEqual(fi.readline(), 'A\n')
            self.assertEqual(fi.readline(), 'B\n')
            self.assertEqual(fi.readline(), 'C\n')
            with self.assertRaises(UnicodeDecodeError):
                # Read to the end of file.
                list(fi)
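Since Python 3.10, the same per-file decoding can be requested without a hook via the encoding parameter; a minimal sketch (the filename is a placeholder):

import fileinput

# equivalent to openhook=fileinput.hook_encoded("utf-8") on Python 3.10+
with fileinput.input(files=["example.txt"], encoding="utf-8") as f:
    for line in f:
        print(line, end="")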
Example #23
def extract_diff_ret_or_fol(source_path_param, choose_str):
    list1 = []
    for line in fileinput.input([source_path_param], openhook=fileinput.hook_encoded("utf8")):
        if choose_str == 'retweet':
            diff_ret = line.split(',')[3]
            diff_ret_int = int(float(diff_ret))  # robust whether or not the value contains a '.'
            list1.append(diff_ret_int)  # Normal

        elif choose_str == 'follower_wt_mc':
            diff_fol = float(line.split(',')[10])
            list1.append(diff_fol)  # Normal

        elif choose_str == 'follower_wo_mc':
            diff_fol = float(line.split(',')[13])
            list1.append(diff_fol)  # Normal
    fileinput.close()
    return list1
Example #24
    def main(self, target_dir):
        for root, dirs, files in os.walk(target_dir):
            for file in files:
                if file.endswith(".jsp") and (file.lower() in teoconstants.uipgms):
                    f = fileinput.input(root + "\\" + file, inplace=0, openhook=fileinput.hook_encoded("utf-8"))

                    for i, line in enumerate(f):
                        iterator = re.finditer(r"^.*(?P<imagename>\b.+\.gif\b)", line, re.IGNORECASE)
                        for match in iterator:
                            ##                            print(file+':'+str(i+1)+'\t'+match.group('imagename'))
                            if match.group("imagename") not in imageList:
                                imageList.append(match.group("imagename"))

                    if imageList:
                        for value in imageList:
                            ##                            print(commonJobs.printFilenameAndPackage(file,root,'public_html.*',r'\.jsp'))
                            print(file + "\t" + value)
                        imageList[:] = []
Example #25
File: more.py Project: ywangd/stash
def more(filenames, pagesize=10, clear=False, fmt='{line}'):
    '''Display content of filenames pagesize lines at a time (cleared if specified) with format fmt for each output line'''

    fileinput.close() # in case still open
    try:
        pageno = 1
        if clear:
            clear_screen()
        for line in fileinput.input(filenames, openhook=fileinput.hook_encoded("utf-8")):
            lineno, filename, filelineno = fileinput.lineno(), fileinput.filename(), fileinput.filelineno()
            print(fmt.format(**locals()), end='')
            if pagesize and lineno % pagesize == 0:
                console.alert('Abort or continue', filename, 'Next page') # TODO: use less intrusive mechanism than alert
                pageno += 1
                if clear:
                    clear_screen()
    finally:
        fileinput.close()
Example #26
    def test_readline(self):
        with open(TESTFN, 'wb') as f:
            f.write('A\nB\r\nC\r')
            # Fill TextIOWrapper buffer.
            f.write('123456789\n' * 1000)
            # Issue #20501: readline() shouldn't read whole file.
            f.write('\x80')
        self.addCleanup(safe_unlink, TESTFN)

        fi = FileInput(files=TESTFN, openhook=hook_encoded('ascii'), bufsize=8)
        # The most likely failure is a UnicodeDecodeError due to the entire
        # file being read when it shouldn't have been.
        self.assertEqual(fi.readline(), u'A\n')
        self.assertEqual(fi.readline(), u'B\r\n')
        self.assertEqual(fi.readline(), u'C\r')
        with self.assertRaises(UnicodeDecodeError):
            # Read to the end of file.
            list(fi)
        fi.close()
Example #27
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', '-o', help='write output to file instead of stdout')
    parser.add_argument('--split', '-s', help='if writing to file, split into multiple files with this many lines per '
                                              'file', type=int, default=0)
    parser.add_argument('--extra-field', '-e', help='extra fields to include. Provide a field name and a pointer to '
                                                    'the field. Example: -e verified user.verified',
                        nargs=2, action='append')
    parser.add_argument('--excel', '-x', help='create file compatible with Excel', action='store_true')
    parser.add_argument('files', metavar='FILE', nargs='*', help='files to read, if empty, stdin is used')
    args = parser.parse_args()

    file_count = 1
    csv_file = None
    if args.output:
        if args.split:
            csv_file = codecs.open(numbered_filepath(args.output, file_count), 'wb', 'utf-8')
            file_count += 1
        else:
            csv_file = codecs.open(args.output, 'wb', 'utf-8')
    else:
        csv_file = sys.stdout
    sheet = csv.writer(csv_file)

    extra_headings = []
    extra_fields = []
    if args.extra_field:
        for heading, field in args.extra_field:
            extra_headings.append(heading)
            extra_fields.append(field)

    sheet.writerow(get_headings(extra_headings=extra_headings))

    files = args.files if len(args.files) > 0 else ('-',)
    for count, line in enumerate(fileinput.input(files, openhook=fileinput.hook_encoded("utf-8"))):
        if args.split and count and count % args.split == 0:
            csv_file.close()
            csv_file = codecs.open(numbered_filepath(args.output, file_count), 'wb', 'utf-8')
            sheet = csv.writer(csv_file)
            sheet.writerow(get_headings(extra_headings=extra_headings))
            file_count += 1
        tweet = json.loads(line)
        sheet.writerow(get_row(tweet, extra_fields=extra_fields, excel=args.excel))
Example #28
    def test_file_opening_hook(self):
        try:
            # cannot use openhook and inplace mode
            fi = FileInput(inplace=1, openhook=lambda f, m: None)
            self.fail("FileInput should raise if both inplace " "and openhook arguments are given")
        except ValueError:
            pass
        try:
            fi = FileInput(openhook=1)
            self.fail("FileInput should check openhook for being callable")
        except ValueError:
            pass
        try:
            t1 = writeTmp(1, ["A\nB"], mode="wb")
            fi = FileInput(files=t1, openhook=hook_encoded("rot13"))
            lines = list(fi)
            self.assertEqual(lines, ["N\n", "O"])
        finally:
            remove_tempfiles(t1)
Example #29
def load_data(files):
    """
    Extract zip and process information into CSV's.

    Parameters
    ----------
    files : list of str

    Returns
    -------
    str :
        combined data from files
    """

    log.info('Loading data: %s.' % ', '.join(files))
    raw_data = fileinput.FileInput(
        files=files, openhook=fileinput.hook_encoded('utf-8')
    )
    log.info('Done loading data.')
    return raw_data
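Note that FileInput opens its files lazily, so load_data returns before any data is read; decoding happens only on iteration, as in this sketch (paths are placeholders):

raw = load_data(["a.txt", "b.txt"])
text = "".join(raw)  # the files are actually read and decoded here
raw.close()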
Example #30
def concatenate_files(input_directory, output_file):
    """

    :param input_directory:
    :param output_file:
    :return:
    """
    assert os.path.isdir(input_directory), 'input path should be a directory'

    if not input_directory.endswith('/'):
        input_directory = ''.join((input_directory, '/'))

    if not check_file_exist(output_file):
        file_names = os.listdir(input_directory)
        file_paths = [''.join((input_directory, f_n)) for f_n in file_names]
        with open(output_file, 'w', encoding='utf-8') as out_file:
            in_file = fileinput.input(files=file_paths, openhook=fileinput.hook_encoded('utf-8'))  # python 2.7.10: fileinput does not have `__exit__`, so `with` cannot be used
            for line in in_file:
                out_file.write(line)
            in_file.close()
Example #31
def main():

    Topic = []
    Utterance = []
    Relevance = []

    regex = u'[^ぁ-ん]+'

    all_filepaths = glob.glob('./training/*')
    for filepath in all_filepaths:
        lines = [
            line.rstrip() for line in fileinput.input(
                filepath, openhook=fileinput.hook_encoded('utf-8'))
        ]

        # Syntax check of the JSON as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('Error found')
            print(e)
            exit(1)

        # Display title
        #print(arguments[0]["Topic"])

        for argument in arguments:
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Relevance"])

    TrueDataset = {}
    correctAnswer = 0
    for line in list(set(Utterance)):
        T_List = []
        R_list = []
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                T_List.append(Topic[line_l])
                R_list.append(Relevance[line_l])
        TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line] = str(
            Counter(R_list).most_common()[0][0])

    # Analyze Utterance using Juman++ & knp
    jumanpp = Jumanpp()
    with open("incorrect.txt", "w") as wf:
        line_cnt = len(TrueDataset)
        now_line_cnt = 0
        for key, label in TrueDataset.items():
            tpc, utr = key.split(":")[0], key.split(":")[1]

            #print(tpc + ":" + utr + "[" + label + "]")

            #parse Topic
            topic_analyed_List = []
            try:
                #0.7909880035111675
                #s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0]
                #topic_result = jumanpp.analysis(s)
                topic_result = jumanpp.analysis(format_text(tpc))
                #print(s)
                for mrph in topic_result.mrph_list():
                    try:
                        if len(re.findall(regex, mrph.midasi)) > 0:
                            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                topic_analyed_List.append(mrph.midasi)
                    except:
                        continue
            except:
                #print("Error.",tpc)
                continue

        #parse Utterance
            utter_analyed_List = []
            try:
                utter_result = jumanpp.analysis(utr)
                for mrph in utter_result.mrph_list():
                    try:
                        if len(re.findall(regex, mrph.midasi)) > 0:
                            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                utter_analyed_List.append(mrph.midasi)
                    except:
                        continue
            except:
                #print("Error.",utr)
                continue

            #print((set(topic_analyed_List) & set(utter_analyed_List)),len(set(topic_analyed_List) & set(utter_analyed_List)))

            if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0):
                #print("1:",label)
                if int(label) == 1:
                    correctAnswer += 1
                else:
                    wf.write(tpc + ":" + utr + "[" + "1" + ":" + label + "]\n")
            else:
                #print("0:",label)
                if int(label) == 0:
                    correctAnswer += 1
                else:
                    wf.write(tpc + ":" + utr + "[" + "0" + ":" + label + "]\n")
            now_line_cnt += 1
            #print( now_line_cnt, "/", line_cnt)

    print("acurracy:", correctAnswer * 1.0 / len(TrueDataset))
Example #32
                    default="utf-8",
                    choices=["utf-8", "utf-8-sig", "utf-16"],
                    help="Input file encoding")
parser.add_argument("-v", action='store_true', default=False)

args = parser.parse_args()

if (args.v):
    print(args)

questions = []

# with open(args.inputfile, 'r', encoding='utf-8') as f:
with fileinput.FileInput(files=args.inputfile,
                         mode='r',
                         openhook=fileinput.hook_encoded(args.encoding)) as f:
    fw = FileWrapper(f)
    qtype = QType.MC
    while True:
        line = fw.readline()
        # print(line)
        if not line:
            break
        if m := re.match(r"Type:\s*(F|FIB|MC|FIB_PLUS|FMB|E|ESS)$", line):
            qtype = Str2QType[m.group(1)]
            if args.v: print("Question type:", t.name)
        elif m := re.match(r"(\d+)\.\s+(.+)", line):
            fw.unreadline(line)
            if qtype == QType.FIB:
                q = FIBQuestion()
                load_question(q, fw, True)
Example #33
def main():

    all_filepaths = glob.glob('./training/*')
    #print("frhifr",all_filepaths)

    Topic = []
    Utterance = []
    Relevance = []
    FactCheck = []
    Stance = []

    for filepath in all_filepaths:

        # args = get_args()
        # Load the JSON
        # src = '-' if not hasattr(args, 'json_file') else args.json_file

        lines = [
            line.rstrip() for line in fileinput.input(
                filepath, openhook=fileinput.hook_encoded('utf-8'))
        ]

        # Syntax check of the JSON as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('Error found')
            print(e)
            exit(1)

        # Display title
        #print(arguments[0]["Topic"])

        for argument in arguments:
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Relevance"])
            FactCheck.append(argument["Fact-checkability"])
            Stance.append(argument["Stance"])

    TrueDataset = []
    for line in list(set(Utterance)):
        cnt = 0
        R_list = []
        F_list = []
        S_list = []
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                cnt += 1
                R_list.append(Relevance[line_l])
                F_list.append(FactCheck[line_l])
                S_list.append(Stance[line_l])
        plane = line + " " + str(
            Counter(R_list).most_common()[0][0]) + " " + str(
                Counter(F_list).most_common()[0][0]) + " " + str(
                    Counter(S_list).most_common()[0][0])
        if not ((cnt == 5 and Counter(S_list).most_common()[0][1] == 2) or
                (cnt == 3 and Counter(S_list).most_common()[0][1] == 1)):
            TrueDataset.append(plane)

    # Analyze Utterance using Juman++
    jumanpp = Jumanpp()
    for arguments in TrueDataset:
        #print(argument["Utterance"],argument["Relevance"],argument["Fact-checkability"],argument["Stance"],argument["Class"])
        argument = arguments.split(" ")
        result = jumanpp.analysis(argument[0])
        analyed_argument = ""
        for mrph in result.mrph_list():
            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                analyed_argument += mrph.midasi + " "

        analyed_argument += "\t"
        analyed_argument += argument[1] + "\t"
        analyed_argument += argument[2] + "\t"
        analyed_argument += argument[3]

        print(analyed_argument)
Example #34
    def weave(self):
        self.abstract_task_desc.setdefault('extra C files', dict())

        clade = Clade(self.conf['build base'])
        if not clade.work_dir_ok():
            raise RuntimeError('Build base is not OK')
        meta = clade.get_meta()

        # This is required to get compiler (Aspectator) specific stdarg.h since kernel C files are compiled
        # with "-nostdinc" option and system stdarg.h couldn't be used.
        aspectator_search_dir = '-isystem' + klever.core.utils.execute(
            self.logger, ('aspectator', '-print-file-name=include'), collect_all_stdout=True)[0]

        env = dict(os.environ)
        # Print stubs instead of inline Assembler since verifiers do not interpret it and even can fail.
        env['LDV_INLINE_ASM_STUB'] = ''

        for grp in self.abstract_task_desc['grps']:
            self.logger.info('Weave in C files of group "{0}"'.format(grp['id']))

            for extra_cc in grp['Extra CCs']:
                # Each CC is either pair (compiler command identifier, compiler command type) or JSON file name
                # with compiler command description.
                if isinstance(extra_cc['CC'], list):
                    cc = clade.get_cmd(*extra_cc['CC'], with_opts=True)
                else:
                    with open(os.path.join(self.conf['main working directory'], extra_cc['CC']),
                              encoding='utf8') as fp:
                        cc = json.load(fp)

                    # extra_cc is a cc command that is not from Clade
                    # Thus paths in it need to be converted to be absolute
                    # like in other Clade commands
                    if "cwd" in cc and "in" in cc:
                        cc["in"] = [os.path.join(cc["cwd"], cc_in) for cc_in in cc["in"]]

                    if "cwd" in cc and "out" in cc:
                        cc["out"] = [os.path.join(cc["cwd"], cc_out) for cc_out in cc["out"]]

                if "in file" in extra_cc:
                    # This is for CC commands with several input files
                    infile = extra_cc["in file"]
                else:
                    infile = cc["in"][0]
                # Distinguish source files having the same names.
                outfile_unique = '{0}.c'.format(klever.core.utils.unique_file_name(os.path.splitext(os.path.basename(
                    infile))[0], '.c'))
                # This is used for storing/getting to/from cache where uniqueness is guaranteed by other means.
                outfile = '{0}.c'.format(os.path.splitext(os.path.basename(infile))[0])
                self.logger.info('Weave in C file "{0}"'.format(infile))

                # Produce aspect to be weaved in.
                if 'plugin aspects' in extra_cc:
                    self.logger.info('Concatenate all aspects of all plugins together')

                    # Resulting aspect.
                    aspect = 'aspect'

                    # Get all aspects. Place RSG aspects at the beginning since they can instrument entities added by
                    # aspects of other plugins while corresponding function declarations still need to be at the
                    # beginning of the file.
                    aspects = []
                    for plugin_aspects in extra_cc['plugin aspects']:
                        if plugin_aspects['plugin'] == 'RSG':
                            aspects[0:0] = plugin_aspects['aspects']
                        else:
                            aspects.extend(plugin_aspects['aspects'])

                    # Concatenate aspects.
                    with open(aspect, 'w', encoding='utf8') as fout, fileinput.input(
                            [os.path.join(self.conf['main working directory'], aspect) for aspect in aspects],
                            openhook=fileinput.hook_encoded('utf8')) as fin:
                        for line in fin:
                            fout.write(line)
                else:
                    # Instrumentation is not required when there is no aspects. But we will still pass source files
                    # through C-backend to make resulting code to look similarly and thus to avoid different issues
                    # at merging source files and models together.
                    aspect = None

                if aspect:
                    self.logger.info('Aspect to be weaved in is "{0}"'.format(aspect))
                else:
                    self.logger.info('C file will be passed through C Back-end only')

                storage_path = clade.get_storage_path(infile)
                if meta['conf'].get('Compiler.preprocess_cmds', False) and \
                        'klever-core-work-dir' not in storage_path:
                    storage_path = storage_path.split('.c')[0] + '.i'

                cwd = clade.get_storage_path(cc['cwd'])

                is_model = (grp['id'] == 'models')

                # Original sources should be woven in and we do not need to get cross references for them since this
                # was already done before.
                if not is_model:
                    self.__weave(storage_path, cc['opts'], aspect, outfile_unique, clade, env, cwd,
                                 aspectator_search_dir, is_model)
                # For generated models we need to weave them in (actually, just pass through C Back-end) and to get
                # cross references always since most likely they all are different.
                elif 'generated' in extra_cc:
                    self.__weave(storage_path, cc['opts'], aspect, outfile_unique, clade, env, cwd,
                                 aspectator_search_dir, is_model)
                    if self.conf['code coverage details'] != 'Original C source files':
                        self.__get_cross_refs(storage_path, cc['opts'], outfile_unique, clade, cwd,
                                              aspectator_search_dir)
                # For non-generated models use results cache in addition.
                else:
                    cache_dir = os.path.join(self.conf['cache directory'],
                                             klever.core.utils.get_file_checksum(storage_path))
                    with klever.core.utils.LockedOpen(cache_dir + '.tmp', 'w'):
                        if os.path.exists(cache_dir):
                            self.logger.info('Get woven in C file from cache')
                            self.abstract_task_desc['extra C files'].append(
                                {'C file': os.path.relpath(os.path.join(cache_dir, os.path.basename(outfile)),
                                                           self.conf['main working directory'])})
                            if self.conf['code coverage details'] != 'Original C source files':
                                self.logger.info('Get cross references from cache')
                                self.__merge_additional_srcs(os.path.join(cache_dir, 'additional sources'))
                        else:
                            os.makedirs(cache_dir)
                            self.__weave(storage_path, cc['opts'], aspect, outfile_unique, clade, env, cwd,
                                         aspectator_search_dir, is_model)
                            self.logger.info('Store woven in C file to cache')
                            shutil.copy(outfile_unique, os.path.join(cache_dir, outfile))

                            if self.conf['code coverage details'] != 'Original C source files':
                                self.__get_cross_refs(storage_path, cc['opts'], outfile_unique, clade, cwd,
                                                      aspectator_search_dir)
                                self.logger.info('Store cross references to cache')
                                shutil.copytree(outfile_unique + ' additional sources',
                                                os.path.join(cache_dir, 'additional sources'))

        # For auxiliary files there are no cross references since it is rather hard to get them from Aspectator. But
        # there is still highlighting.
        if self.conf['code coverage details'] == 'All source files':
            for aux_file in glob.glob('*.aux'):
                new_file = os.path.join('additional sources', 'generated models',
                                        os.path.relpath(aux_file, self.conf['main working directory']))

                os.makedirs(os.path.dirname(new_file), exist_ok=True)
                shutil.copy(aux_file, new_file)

                cross_refs = CrossRefs(self.conf, self.logger, clade, aux_file, new_file, self.search_dirs)
                cross_refs.get_cross_refs()

        self.abstract_task_desc['additional sources'] = os.path.relpath('additional sources',
                                                                        self.conf['main working directory']) \
            if os.path.isdir('additional sources') else None

        # Copy additional sources for total code coverage.
        if self.conf['code coverage details'] != 'Original C source files':
            with klever.core.utils.Cd('additional sources'):
                for root, dirs, files in os.walk(os.path.curdir):
                    for file in files:
                        # These files are handled below in addition to corresponding source files.
                        if file.endswith('.json'):
                            continue

                        if self.conf['code coverage details'] == 'C source files including models' \
                                and not file.endswith('.c'):
                            continue

                        file = os.path.join(root, file)
                        new_file = os.path.join(self.conf['additional sources directory'], file)
                        os.makedirs(os.path.dirname(new_file), exist_ok=True)

                        with klever.core.utils.LockedOpen(new_file + '.tmp', 'w'):
                            if os.path.isfile(new_file):
                                os.remove(new_file + '.tmp')
                                continue

                            shutil.copy(file, new_file)
                            shutil.copy(file + '.idx.json', new_file + '.idx.json')

                            os.remove(new_file + '.tmp')

        # These sections won't be referred to any more.
        del (self.abstract_task_desc['grps'])
        del (self.abstract_task_desc['deps'])
Example #35
    def check(mode, expected_lines):
        fi = FileInput(files=TESTFN, mode=mode,
                       openhook=hook_encoded('utf-7'))
        lines = list(fi)
        fi.close()
        self.assertEqual(lines, expected_lines)
Example #36
def parse_csv():
    """Given the location of CSV and TXT files, parse the CSV for notable items"""

    error_output = list()
    Container = list()

    if Config.yara_folder and has_yara:
        yara_rules = yara_import_rules()
    else:
        yara_rules = ''
    if Config.debug:
        print('[_] Loaded rules:', type(yara_rules))

    # Use fileinput.input() now to read data line-by-line
    if Config.debug:
        print('[_] Parsing in CSV contents...')
    for original_line in fileinput.input(
            Config.csv_file, openhook=fileinput.hook_encoded('iso-8859-1')):
        evt = None
        server = ''
        # Ignore lines beginning w/ a tab or non-quote.
        if original_line[0] != '"':
            continue
        line = original_line.strip(whitespace + '"')
        field = line.strip().split('","')
        try:
            if field[3] in ['Process Create'] and field[5] == 'SUCCESS':
                cmdline = field[6].split('Command line: ')[1]
                if not blacklist_scan(cmd_blacklist, field):
                    if Config.generalize_paths:
                        cmdline = generalize_var(cmdline)
                    child_pid = field[6].split('PID: ')[1].split(',')[0]
                    evt = Event(time=field[0],
                                group='Process',
                                activity='CreateProcess',
                                process=field[1],
                                PID=field[2],
                                process_value=cmdline.replace('"', ''),
                                child_pid=child_pid)
            elif field[3] == 'CreateFile' and field[5] == 'SUCCESS':
                if not blacklist_scan(file_blacklist, field):
                    path = field[4]
                    if os.path.isdir(path):
                        if Config.generalize_paths:
                            path = generalize_var(path)
                        evt = Event(time=field[0],
                                    group='File',
                                    activity='CreateFolder',
                                    process=field[1],
                                    PID=field[2],
                                    process_value=path)
                    else:
                        yara_hits = ''
                        av_hits = ''

                        if Config.generalize_paths:
                            path = generalize_var(path)

                        evt = Event(time=field[0],
                                    group='File',
                                    activity='CreateFile',
                                    process=field[1],
                                    PID=field[2],
                                    process_value=path)
                        if file_exists(path):
                            if Config.debug:
                                print('[_] File: %s\texists' % path)
                            try:
                                md5 = md5_file(path)
                                evt.tags['MD5'] = md5
                                if Config.debug:
                                    print('[_]\t%s' % md5)
                            except (IndexError, IOError):
                                md5 = ''
                                if Config.debug:
                                    print('[_]\tMD5 could not be calculated')

                            if Config.yara_folder and yara_rules:
                                print('[*] Scanning with YARA: %s' % path)
                                yara_hits = yara_filescan(path, yara_rules)
                                if yara_hits:
                                    evt.tags['YARA'] = yara_hits
                                    if Config.debug:
                                        print('[_] YARA: %s' % yara_hits)
                                else:
                                    if Config.debug:
                                        print('[_] No YARA hits.')
                            if has_virustotal:
                                av_hits = virustotal_scan_file(md5)
                                if av_hits:
                                    evt.tags['VirusTotal'] = av_hits
                                    if Config.debug:
                                        print('[_] VT: %s' % av_hits)

            elif field[3] == 'SetDispositionInformationFile' and field[
                    5] == 'SUCCESS':
                if not blacklist_scan(file_blacklist, field):
                    path = field[4]
                    if Config.generalize_paths:
                        path = generalize_var(path)

                    evt = Event(time=field[0],
                                group='File',
                                activity='DeleteFile',
                                process=field[1],
                                PID=field[2],
                                process_value=path)
            elif field[3] == 'SetRenameInformationFile':
                if not blacklist_scan(file_blacklist, field):
                    from_file = field[4]
                    to_file = field[6].split('FileName: ')[1].strip('"')
                    if Config.generalize_paths:
                        from_file = generalize_var(from_file)
                        to_file = generalize_var(to_file)

                    evt = Event(time=field[0],
                                group='File',
                                activity='RenameFile',
                                process=field[1],
                                PID=field[2],
                                process_value='%s => %s' %
                                (from_file, to_file))
            elif field[3] == 'RegCreateKey' and field[5] == 'SUCCESS':
                if not blacklist_scan(reg_blacklist, field):
                    evt = Event(time=field[0],
                                group='Registry',
                                activity='RegCreateKey',
                                process=field[1],
                                PID=field[2],
                                process_value=field[4])
            elif field[3] == 'RegSetValue' and field[5] == 'SUCCESS':
                if not blacklist_scan(reg_blacklist, field):
                    reg_length = field[6].split('Length:')[1].split(
                        ',')[0].strip(whitespace + '"')
                    if int(reg_length):
                        data_field = field[6].split('Data:')[1].strip(
                            whitespace + '"')
                        if len(data_field.split(' ')) == 16:
                            data_field += ' ...'

                        evt = Event(time=field[0],
                                    group='Registry',
                                    activity='RegSetValue',
                                    process=field[1],
                                    PID=field[2],
                                    process_value='%s = %s' %
                                    (field[4], data_field))
            elif field[3] == 'RegDeleteValue':  # and field[5] == 'SUCCESS':
                # SUCCESS is commented out to allow all attempted deletions, whether or not the value exists
                if not blacklist_scan(reg_blacklist, field):
                    evt = Event(time=field[0],
                                group='Registry',
                                activity='RegDeleteValue',
                                process=field[1],
                                PID=field[2],
                                process_value=field[4])
            elif field[3] == 'RegDeleteKey':  # and field[5] == 'SUCCESS':
                # SUCCESS is commented out to allow all attempted deletions, whether or not the value exists
                if not blacklist_scan(reg_blacklist, field):
                    evt = Event(time=field[0],
                                group='Registry',
                                activity='RegDeleteKey',
                                process=field[1],
                                PID=field[2],
                                process_value=field[4])
            elif (field[3] == 'UDP Send'
                  or field[3] == 'UDP Receive') and field[5] == 'SUCCESS':
                if not blacklist_scan(net_blacklist, field):
                    server = field[4].split('-> ')[1]
                    hostname = server.split(':')[0]

                    # TODO: work on this later, once I can verify it better.
                    #if field[6] == 'Length: 20':
                    #    output_line = '[DNS Query] %s:%s > %s' % (field[1], field[2], protocol_replace(server))
                    #else:
                    evt = Event(time=field[0],
                                group='Network',
                                activity='UDP',
                                process=field[1],
                                PID=field[2],
                                process_value=protocol_replace(server),
                                hostname=hostname)
            elif (field[3] == 'TCP Send'
                  or field[3] == 'TCP Receive') and field[5] == 'SUCCESS':
                if not blacklist_scan(net_blacklist, field):
                    server = field[4].split('-> ')[1]
                    hostname = server.split(':')[0]

                    evt = Event(time=field[0],
                                group='Network',
                                activity='TCP',
                                process=field[1],
                                PID=field[2],
                                process_value=protocol_replace(server),
                                hostname=hostname)
        except IndexError:
            if Config.debug:
                sys.stderr.write(line)
                sys.stderr.write(format_exc())
            error_output.append(original_line.strip())

        if evt:
            Container.append(evt)

    if error_output:
        error_str = '\r\n\r\n\r\n\r\n\r\n\r\nERRORS DETECTED\r\n'
        error_str += 'The following items could not be parsed correctly:\r\n'
        for error in error_output:
            error_str += error + '\r\n'

    #} End of file input processing
    return Container
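
The parser above leans on an Event record and a Container list that are defined elsewhere in the script. A minimal sketch of what those definitions could look like, inferred purely from the keyword arguments used here (the field order and the default are assumptions, not the project's actual code):

# Hypothetical supporting definitions, inferred from the Event(...) calls above.
from collections import namedtuple

# Fields mirror the keyword arguments passed to Event(...); 'hostname' is only
# set for network events, so it defaults to ''.
Event = namedtuple('Event',
                   ['time', 'group', 'activity', 'process', 'PID',
                    'process_value', 'hostname'])
Event.__new__.__defaults__ = ('',)  # default for the trailing 'hostname' field

Container = []  # every parsed Event is appended here and returned
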
示例#37
0
    sqlElements += ',%s'
    sqlUpdate += ',' + header[i] + '=values(' + header[i] + ')'

sqlVal = sqlVal.replace(",", "", 1)
sqlElements = sqlElements.replace(",", "", 1)
sqlUpdate = sqlUpdate.replace(",", "", 1)

for root, dirs, files in os.walk(os.path.join('./')):
    for name in files:
        fileName = os.path.join(root, name)
        if name.endswith('csv'):
            print(fileName)
            sql_value = []
            c = 0
            for line in fileinput.input(fileName,
                                        openhook=fileinput.hook_encoded(
                                            "utf-8", "surrogateescape")):
                if c == 0:
                    c = c + 1
                    continue

                contentList = line.split(',')
                tp = []
                for i in range(len(header)):
                    tp.append(contentList[i])

                sql_value.append(tuple(tp))

            cursor.execute("""
            IF OBJECT_ID('test', 'U') IS NOT NULL
                DROP TABLE test
            CREATE TABLE test (
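
The snippet breaks off in the middle of the CREATE TABLE statement. Once the rows are collected, a hedged sketch of how they might be written back, assuming a MySQL-style connector (the col=values(col) strings assembled into sqlUpdate above point at an ON DUPLICATE KEY UPDATE upsert; conn is a hypothetical connection object):

# Hypothetical upsert built from the strings assembled above; 'conn' is assumed.
sql = ('INSERT INTO test (%s) VALUES (%s) ON DUPLICATE KEY UPDATE %s'
       % (sqlVal, sqlElements, sqlUpdate))
cursor.executemany(sql, sql_value)
conn.commit()
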
示例#38
0
    def main(self):
        hist = []
        current = 0

        parser = argparse.ArgumentParser()

        parser.add_argument('--title', help='set the plot title')
        parser.add_argument('--encoding', help='e.g. iso-8859-1, utf-8')
        parser.add_argument('--fly',
                            help='Update the plot on the fly',
                            action='store_true')
        parser.add_argument('--verbose',
                            help='Output verbosely',
                            action='store_true')
        parser.add_argument('--run-command',
                            help='Run the configured command',
                            action='store_true')
        parser.add_argument(
            '--range-max',
            help='set the max value to filter out the abnormal',
            type=int)
        parser.add_argument(
            '--range-min',
            help='set the min value to filter out the abnormal',
            type=int)
        parser.add_argument('--xliml',
                            help='The left xlim in data coordinates',
                            type=float)
        parser.add_argument('--xlimr',
                            help='The right xlim in data coordinates',
                            type=float)
        parser.add_argument('--config', help='Config INI file')
        parser.add_argument('--output', help='Record log')
        parser.add_argument('file',
                            metavar='FILE',
                            help='files to read, if empty, stdin is used')
        args = parser.parse_args()

        if args.encoding:
            self.ENCODING = args.encoding
        if args.title:
            self.TITLE = args.title
        if args.range_max:
            self.range_max = args.range_max
        if args.range_min:
            self.range_min = args.range_min
        if args.xliml:
            self.XLIM_LEFT = args.xliml
        if args.xlimr:
            self.XLIM_RIGHT = args.xlimr
        if not args.run_command:
            self.command = None
        if args.output:
            self.log_output = args.output
        if args.config:
            self.config = args.config

        self.read_config_file()

        self.verbose = args.verbose
        self.UPDATE_ON_THE_FLY = args.fly
        if self.log_output:
            self.output_file = open(self.log_output, 'w')

        self.run_command()
        for line in fileinput.input(args.file,
                                    openhook=fileinput.hook_encoded(
                                        self.ENCODING)):
            line = line.strip()
            line_no = fileinput.lineno()
            self.line = line
            print_log = False
            print_current = False
            print_end = False

            for pattern in self.PRINT_PATTERN:
                if pattern in line:
                    self.t_current = LogTimeProfiler.parse_time(line)
                    print_log = True

            if not self.measure_started and self.MEASURE_START in line:
                self.measure_started = True
                print('measure_started =', self.measure_started)
                self.t_current = LogTimeProfiler.parse_time(line)
                self.t_request = self.t_current
                self.t_session_start = self.t_current
                print_log = True
            elif self.measure_started and self.MEASURE_END in line:
                self.measure_started = False
                print('measure_started =', self.measure_started)
                self.t_current = LogTimeProfiler.parse_time(line)
                self.t_response = self.t_current
                print_log = True

                current = round(self.t_response - self.t_request, 2)
                print(self.command, self.command_delay)
                if self.command and self.command_delay:
                    if self.action_timer:
                        self.action_timer.cancel()
                    self.action_timer = Timer(self.command_delay / 1000.0,
                                              self.run_command, [True])
                    self.action_timer.start()

                if current < 0:
                    self.print_log(line_no, line)
                    continue
                if self.range_min and current < self.range_min:
                    self.print_log(line_no, line)
                    continue
                if self.range_max and current > self.range_max:
                    self.print_log(line_no, line)
                    continue
                print_current = True
                print_end = True
                hist.append(current)
                self.test_count += 1

                if self.UPDATE_ON_THE_FLY:
                    self.show_plot(hist)

            if self.verbose and print_log:
                self.print_log(line_no, self.t_current - self.t_request, line)
            if print_current:
                self.print_log(current)
            if print_end:
                self.print_log('-' * 80)
            if self.command and self.command_count == self.test_count:
                break

        self.print_log('=' * 10, self.TITLE, 'Summary', '=' * 10)
        self.print_log('Result Count: {}'.format(len(hist)))
        self.print_log(
            'Benchmark: max = {}, min = {}, mean = {:.2f}, std = {:.2f}, median = {:.2f}'
            .format(max(hist), min(hist), np.mean(hist), np.std(hist),
                    np.median(hist)))
        self.print_log(hist)
        self.show_plot(hist)
        plt.show()
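
The method above clearly lives on a class: it calls LogTimeProfiler.parse_time and a number of self.* helpers. A hedged sketch of the entry point, assuming the class is the LogTimeProfiler referenced in the code and that its constructor takes no arguments:

# Hypothetical entry point; the constructor signature is an assumption.
if __name__ == '__main__':
    LogTimeProfiler().main()
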
示例#39
0
def run(target_dir, inplaceFlag=0):
    global showHeaderTableCount, alreadyChanged, showHeaderTableChanged
    retrieveFlag = 0 # 1: start retrieve; 2: end retrieve
    for root, dirs, files in os.walk(target_dir):
        for file in files:
            showHeaderTable = ''
            oldHeader=''
            if file.endswith('.jsp') and (file.lower() in teoconstants.uipgms):
                print('Processing '+file)
                if inplaceFlag == 0:   #improve performance
                    f = fileinput.input(root+"\\"+ file, inplace=inplaceFlag, openhook=fileinput.hook_encoded('utf-8'))
                elif inplaceFlag == 1:
                    f = fileinput.input(root+"\\"+ file, inplace=inplaceFlag)

                for i, line in enumerate(f):
                    if(re.search('posui:showHeaderTable', line, re.IGNORECASE)):
                        showHeaderTableCount += 1
                        retrieveFlag = 1
                        showHeaderTable += line
                    if retrieveFlag == 1:
                        if(not re.search('posui:showHeaderTable', line))and(not re.search('/>', line)):
                            showHeaderTable += line
                            indent = (re.search('^(?P<indent>[ \t]*)[a-zA-Z\</\n]?',line,re.IGNORECASE)).group('indent')
                        if(re.search('headers.*\=.*"\<%\=',showHeaderTable)):   # E.g: headers = "<%=headerTit%>"
                            if inplaceFlag == 0:
                                print('  Inappropriate header found at line '+str(i+1))
                            elif inplaceFlag == 1:
                                print(line)
                            retrieveFlag = 2
                            continue
                        elif(re.search('headers.*\=',line)and(line.count('"')==2)):
                            headIndent = indent
                            oldHeader=(re.search('.*\"(?P<header>.*)\"',line)).group('header')
                            if(inplaceFlag == 0):
                                print('oldHeader case 1='+oldHeader)
                            line = appendToolTipsToNewHeader(oldHeader, headIndent)
                            if inplaceFlag == 1:
                                line = line.encode('utf-8')
                            oldHeader=''
                        elif(re.search('headers.*\=',line)and(line.count('"')==1)):
                            headIndent = indent
                            oldHeader=(re.search('.*\"(?P<header>.*)',line)).group('header')
                            continue
                        elif((oldHeader != '') and (line.count('"')==0)):
                            oldHeader += (re.search('^[ \t]*(?P<header>.*)',line)).group('header')
                            continue
                        elif((oldHeader != '') and (line.count('"')==1)):
                            oldHeader += (re.search('^[ \t]*(?P<header>.*)\"',line)).group('header')
                            oldHeader = oldHeader.replace('\r','')
                            oldHeader = re.sub(';$','',oldHeader)
                            if(inplaceFlag == 0):
                                print('oldHeader case 2='+oldHeader)
                            line = appendToolTipsToNewHeader(oldHeader, headIndent)
                            if inplaceFlag == 1:
                                line = line.encode('utf-8')
                            oldHeader=''
                        
                        if (re.search('/>', line)):
                            retrieveFlag = 2
                            if (not re.search('isMultiLang', showHeaderTable)):
                                line = indent+'isMultiLang="true"\n' \
                                       +indent+'tableEvent="nowrap style=\'table-layout:fixed\'"\n' \
                                       +indent+'toolTipLocales="en"\n' \
                                       +indent+'isColspanFix="true"\n' \
                                       +indent+'isNoToolTipScript="true"\n' \
                                       +line
                            else:
                                alreadyChanged += 1
                                if inplaceFlag == 0:
                                    print('  isMultiLang already present; skipping')
                            showHeaderTable += line
                            if inplaceFlag == 0:
                                sys.stdout.write(showHeaderTable)
                    if retrieveFlag == 2:
                        showHeaderTable = ''
                        retrieveFlag = 0
                                       
                        
                    if inplaceFlag == 1:
                        sys.stdout.write(line)

                f.close()
示例#40
0
import fileinput
import pprint

results = {}
words_to_look_for = ('monster', 'monsters')

with fileinput.input(openhook=fileinput.hook_encoded("utf-8")) as f:
    for line in f:
        for word in line.split(' '):
            word = word.lower().strip('"')
            if word in results:
                results[word] += 1
            else:
                if word in words_to_look_for:
                    results[word] = 1
print("Results: ")
pprint.pprint(results)
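
The same tally can be written with collections.Counter, which removes the first-seen branch. A sketch under the same assumptions (file arguments or stdin, UTF-8 input):

import collections
import fileinput

words_to_look_for = ('monster', 'monsters')

# Counter handles initialisation, so only the membership filter remains.
with fileinput.input(openhook=fileinput.hook_encoded("utf-8")) as f:
    results = collections.Counter(
        word.lower().strip('"')
        for line in f
        for word in line.split(' ')
        if word.lower().strip('"') in words_to_look_for)

print("Results: ")
print(dict(results))
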
示例#41
0
def main(args):
    global _stash
    ap = argparse.ArgumentParser()
    ap.add_argument('pattern', help='the pattern to match')
    ap.add_argument('files', nargs='*', help='files to be searched')
    ap.add_argument('-i',
                    '--ignore-case',
                    action='store_true',
                    help='ignore case while searching')
    ap.add_argument('-v',
                    '--invert',
                    action='store_true',
                    help='invert the search result')
    ap.add_argument('-c',
                    '--count',
                    action='store_true',
                    help='count the search results instead of normal output')
    ns = ap.parse_args(args)

    flags = 0
    if ns.ignore_case:
        flags |= re.IGNORECASE

    pattern = re.compile(ns.pattern, flags=flags)

    # Do not try to grep directories
    files = [f for f in ns.files if not os.path.isdir(f)]

    fileinput.close()  # in case it is not closed
    try:
        counts = collections.defaultdict(int)
        for line in fileinput.input(files,
                                    openhook=fileinput.hook_encoded("utf-8")):
            if bool(pattern.search(line)) != ns.invert:
                if ns.count:
                    counts[fileinput.filename()] += 1
                else:
                    if ns.invert:  # optimize: if ns.invert, then no match, so no highlight color needed
                        newline = line
                    else:
                        newline = re.sub(
                            pattern,
                            lambda m: _stash.text_color(m.group(), 'red'),
                            line)
                    if fileinput.isstdin():
                        fmt = u'{lineno}: {line}'
                    else:
                        fmt = u'{filename}: {lineno}: {line}'

                    print(
                        fmt.format(filename=fileinput.filename(),
                                   lineno=fileinput.filelineno(),
                                   line=newline.rstrip()))

        if ns.count:
            for filename, count in counts.items():
                fmt = u'{count:6} {filename}'
                print(fmt.format(filename=filename, count=count))

    except Exception as err:
        print("grep: {}: {!s}".format(type(err).__name__, err),
              file=sys.stderr)
    finally:
        fileinput.close()
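
This reads like a StaSh command: the _stash global is normally injected by the shell before main runs. A hypothetical harness for exercising the script standalone; the stub class and its pass-through text_color are assumptions, not part of StaSh:

# Hypothetical standalone harness; inside StaSh, _stash is provided by the shell.
if __name__ == '__main__':
    import sys

    class _FakeStash(object):
        # Minimal stand-in: the real object colours matched text in the console.
        def text_color(self, text, color):
            return text

    _stash = _FakeStash()
    main(sys.argv[1:])
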
示例#42
0
def main():

    print("fsovs")

    Topic = []
    Utterance = []
    Relevance = []

    regex = u'[^ぁ-ん]+'  # match runs of characters that are not hiragana

    # Training data, format: [label, Topic & Utterance]
    wf_Data = open("Tpc&UTR_Stance.csv","w")

    all_filepaths=glob.glob('./training/*')
    for filepath in all_filepaths:
        lines = [line.rstrip() for line in fileinput.input(
            filepath, openhook=fileinput.hook_encoded('utf-8'))]

        # Syntax-check the JSON document as a whole
        try:
            arguments = json.loads('\n'.join(lines))
        except json.JSONDecodeError as e:
            print('Error in JSON')
            print(e)
            exit(1)

        # Display title
        #print(arguments[0]["Topic"])
        
        for argument in arguments:
            Topic.append(argument["Topic"])
            Utterance.append(argument["Utterance"])
            Relevance.append(argument["Stance"])       

    TrueDataset = {}
    correctAnswer_0 = 0
    correctAnswer_1 = 0
    for line in list(set(Utterance)): 
        T_List = [] 
        R_list = []
        for line_l in range(len(Utterance)):
            if line == Utterance[line_l]:
                T_List.append(Topic[line_l])
                R_list.append(Relevance[line_l])
        TrueDataset[Counter(T_List).most_common()[0][0] + ":" + line] = str(Counter(R_list).most_common()[0][0])

    sorted(TrueDataset.items())  # NOTE: result is discarded; dicts are not sorted in place

    # Analyze Utterance using Juman++ & knp
    jumanpp = Jumanpp()
    with open("incorrectTrus.txt","w") as wf:
        line_cnt = len(TrueDataset)
        now_line_cnt = 0
        for key, label in TrueDataset.items():
            tpc,utr = key.split(":")[0],key.split(":")[1]
            topANDutrANDlabelList = []

            #parse Topic
            topic_analyed_List = []
            topANDutrANDlabelList.append("Topic")
            try:
                #0.7909880035111675
                #s = tpc.split("を")[-2] + "を" + tpc.split("を")[-1].split("べきである")[0] 
                #topic_result = jumanpp.analysis(s)
                topic_result = jumanpp.analysis(format_text(tpc))
                #print(s)
                for mrph in topic_result.mrph_list():
                    try :
                        if len(re.findall(regex, mrph.genkei)) > 0:
                            if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                if "数量" in mrph.imis:
                                    topic_analyed_List.append(mrph.genkei)
                                    topANDutrANDlabelList.append("[数]") 
                                else:
                                    topic_analyed_List.append(mrph.genkei)
                                    topANDutrANDlabelList.append(mrph.genkei)
                    except:
                        continue
            except:
                continue

        #parse Utterance
            utter_analyed_List = []
            topANDutrANDlabelList.append("Utterance")
            try:
                if "、" in utr:
                    utrList = utr.split("、")
                    for sentence in utrList:

                        # edge case: skip empty fragments
                        if sentence == "":
                            continue
                        
                        utter_result = jumanpp.analysis(sentence)
                        for mrph in utter_result.mrph_list():
                            try :
                                if len(re.findall(regex, mrph.genkei)) > 0:
                                    if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                        if "数量" in mrph.imis:
                                            utter_analyed_List.append(mrph.genkei)
                                            topANDutrANDlabelList.append("[数]") 
                                        else:
                                            utter_analyed_List.append(mrph.genkei)
                                            topANDutrANDlabelList.append(mrph.genkei)

                                else:
                                    continue
                            except:
                                print("error")
                                continue

                else:
                    utter_result = jumanpp.analysis(utr)
                    for mrph in utter_result.mrph_list():
                        try :
                            if len(re.findall(regex, mrph.genkei)) > 0:
                                if ("名詞" in mrph.hinsi or "動詞" in mrph.hinsi):
                                    if "数量" in mrph.imis:
                                        utter_analyed_List.append(mrph.genkei)
                                        topANDutrANDlabelList.append("[数]") 
                                    else:
                                        utter_analyed_List.append(mrph.genkei)
                                        topANDutrANDlabelList.append(mrph.genkei)
                        except:
                            print("error")
                            continue
                topANDutrANDlabelList.append("END")
                    
            except:
                print("error")
                continue

            if "END" in topANDutrANDlabelList:
                #print(topANDutrANDlabelList)
                wf_Data.write(str(label) + "," + " ".join(topANDutrANDlabelList[:-1])+"\n")
            #print((set(topic_analyed_List) & set(utter_analyed_List)),len(set(topic_analyed_List) & set(utter_analyed_List)))

            #if (len(set(topic_analyed_List) & set(utter_analyed_List)) > 0):
                #print("1:",label)
            if int(label) == 1:
                wf.write(tpc + ":" + utr + "[" + "1" + ":" +label + "]\n")
            elif int(label) == 2:
                wf.write(tpc + ":" + utr + "[" + "2" + ":" +label + "]\n")
            else:
                wf.write(tpc + ":" + utr + "[" + "0" + ":" +label + "]\n")
示例#43
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/3/10 17:36
# @Author  : JJkinging
# @File    : test.py
import fileinput

symptom = list(
    map(
        lambda x: x.strip(),
        fileinput.FileInput(
            r'D:\python_project\AI_doctor\doctor_offline\structured\reviewed\2型糖尿病.csv',
            openhook=fileinput.hook_encoded('utf-8'))))
print(symptom)
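
The FileInput object above is never closed. Since Python 3.2 FileInput supports the context-manager protocol, so an equivalent sketch that also releases the file handle (same path as above):

import fileinput

# Same one-shot read, but the with-block closes the underlying file.
with fileinput.FileInput(
        r'D:\python_project\AI_doctor\doctor_offline\structured\reviewed\2型糖尿病.csv',
        openhook=fileinput.hook_encoded('utf-8')) as f:
    symptom = [x.strip() for x in f]
print(symptom)
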
示例#44
0
def main():
    parser = argparse.ArgumentParser(
        description="""Takes a file containing "word_i marker word_j" tuples
        and builds a co-occurrence count -core space- for each marker.
        It will also build a sparse matrix for detected
        compositions -peripheral space-
        (e.g. word_i1<-->word_i2 marker word_j).""")

    parser.add_argument('input',
                        help="coocurrence tuples",
                        default="-",
                        nargs='*')
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument(
        '-o',
        '--output_dir',
        help="directory where a coocurrence count file will be created "
        "for each pattern",
        required=True)
    parser.add_argument('-x',
                        '--compose-op',
                        help='string used to identify'
                        ' a peripheral space token',
                        default='<-->')
    parser.add_argument('-c', '--cols', help='filter context words')
    parser.add_argument('-r', '--rows', help='filter pivots')
    parser.add_argument('-m',
                        '--many',
                        help='number of records needed to '
                        'start dumping',
                        type=int,
                        default=MANY)
    parser.add_argument('-b',
                        '--batch-size',
                        help='size of batches inserted '
                        'into the DB',
                        type=int,
                        default=BATCH_SIZE)
    parser.add_argument('-e',
                        '--db-engine',
                        help="Destination format",
                        choices=['mysql', 'sqlite', 'text'],
                        default='text')
    parser.add_argument('--asynchronic',
                        dest='synchronic',
                        help='continue counting while saving',
                        action='store_false',
                        default=True)
    parser.add_argument('-u',
                        '--mysql_user',
                        help='MYSQL username',
                        default=MYSQL_USER)
    parser.add_argument('-p',
                        '--mysql_passwd',
                        help='MYSQL password',
                        default=MYSQL_PASS)
    parser.add_argument('-H',
                        '--mysql_hostname',
                        help='MYSQL hostname',
                        default=MYSQL_HOST)
    parser.add_argument('-P',
                        '--mysql_port',
                        help='MySQL port',
                        default=MYSQL_PORT,
                        type=int)
    #TODO: add option to customize dense or sparse

    args = parser.parse_args()
    if args.verbose == 0:
        logger.setLevel(logging.ERROR)
    if args.verbose == 1:
        logger.setLevel(logging.INFO)
    if args.verbose == 2:
        logger.setLevel(logging.DEBUG)

    logger.info("Started at {0}".format(str(
        time.strftime("%d-%m-%Y %H:%M:%S"))))
    #make sure outdir exists
    try:
        os.makedirs(args.output_dir)
    except OSError:
        pass

    if args.cols:
        with open(args.cols) as f_cols:
            cols = [col.rstrip('\n') for col in f_cols]
        col2id = dict((col, i) for i, col in enumerate(cols))
    else:
        cols = None
        col2id = None
    if args.rows:
        with open(args.rows) as f_rows:
            rows = [row.rstrip('\n') for row in f_rows]
        row2id = dict((row, i) for i, row in enumerate(rows))
    else:
        rows = None
        row2id = None

    if args.db_engine == 'mysql':
        per_output_db = args.output_dir + '_peripheral'
        core_output_db = args.output_dir + '_core'
        # argparse stores the --mysql_* options under these attribute names
        per_dest = MySQLDestination(args.mysql_hostname, args.mysql_port,
                                    args.mysql_user, args.mysql_passwd,
                                    per_output_db, ['cc'], args.batch_size)
        core_dest = MySQLDestination(args.mysql_hostname, args.mysql_port,
                                     args.mysql_user, args.mysql_passwd,
                                     core_output_db, ['cc'], args.batch_size)
    elif args.db_engine == 'sqlite':
        per_output_db = os.path.join(args.output_dir, 'peripheral.db')
        core_output_db = os.path.join(args.output_dir, 'core.db')
        per_dest = SqliteDestination(per_output_db, args.batch_size)
        core_dest = SqliteDestination(core_output_db, args.batch_size)
    elif args.db_engine == 'text':
        per_output_db = os.path.join(args.output_dir, 'peripheral')
        core_output_db = os.path.join(args.output_dir, 'core')
        per_dest = TextDestination(per_output_db)
        core_dest = TextDestination(core_output_db)

    with core_dest, per_dest:
        core = SparseCounter(core_dest, args.many, args.synchronic)
        per = SparseCounter(per_dest, args.many, args.synchronic)

        with Timer() as t_counting:
            try:
                i = 0
                for l in fileinput.input(
                        args.input, openhook=fileinput.hook_encoded("utf-8")):
                    i += 1
                    if i % 100000 == 0:
                        sys.stdout.write('.')
                    if i % 10000000 == 0:
                        sys.stdout.write('\n')
                    sys.stdout.flush()
                    [w1, w2] = l.rstrip('\n').split('\t')
                    if args.compose_op in w1:
                        tg = w1.split(args.compose_op)[1]
                        if (not row2id or tg in row2id) and (not col2id
                                                             or w2 in col2id):
                            per.count(w1, 'c', w2)
                    else:
                        if (not row2id or w1 in row2id) and (not col2id
                                                             or w2 in col2id):
                            core.count(w1, 'c', w2)
            except ValueError:
                logger.error("Error reading line: {0}".format(l))

        logger.info("Counting Finished (t={0:.2f})".format(
            t_counting.interval))
        #wait for any pending saves
        core.join()
        per.join()
        #save residuals
        while len(core) > 0:
            core.save()
        while len(per) > 0:
            per.save()
    logger.info("Finished at {0}".format(
        str(time.strftime("%d-%m-%Y %H:%M:%S"))))
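
SparseCounter, TextDestination and the other destination classes are project code not shown here. From the calls above one can infer the rough shape of a destination: constructed with a path, entered as a context manager, and handed batches of counts by its SparseCounter. A hedged sketch of that inferred interface (the write method name and record layout are assumptions):

# Rough shape of a destination, inferred from usage above; not the real class.
class SketchTextDestination(object):
    def __init__(self, path):
        self.path = path
        self.fh = None

    def __enter__(self):
        self.fh = open(self.path, 'w')
        return self

    def __exit__(self, exc_type, exc, tb):
        self.fh.close()

    def write(self, records):
        # records are assumed to be (row, link, column, count) tuples
        for row, link, col, count in records:
            self.fh.write('%s\t%s\t%s\t%d\n' % (row, link, col, count))
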
示例#45
0
               'L':  {'P': Counter(numer=0, denom=0), 'R': Counter(numer=0, denom=0)}}
    return confs, softPR

data = defaultdict(newConfsMap)   # labelset => confs map


nSeqs = Counter()   # label set => number of sequences having some (predicted or gold) label in the set
nTokens = Counter() # label set => number of tokens in the sequences corresponding to this label set
allLabels = set()

global nIgnoredTokens, nIgnoredSeqs
nIgnoredTokens = 0
nIgnoredSeqs = 0

sys.stdin = codecs.getreader("utf-8")(sys.stdin)
for seq in loadSequences(fileinput.input(args.conllFiles, openhook=fileinput.hook_encoded("utf-8")), scheme):
    tkns,golds,preds = zip(*seq)
    tkns,golds,preds = list(tkns),list(golds),list(preds)
    labelsThisSeq = set(itm[1] for itm in golds+preds if itm[0]!='O')
    allLabels.update(labelsThisSeq)

    selectedLbls = args.l
    if selectedLbls:
        lblsets = {tuple(selectedLbls)} # a specific subset of labels
    elif args.L:
        lblsets = {()}  # all labels
    else:
        lblsets = {(lbl,) for lbl in allLabels} | {()}  # all labels, plus each label individually

    for lblset in lblsets:
        if lblset==('LOC',):
示例#46
0
def run(target_dir, inplaceFlag=0):
    global showSelectListCount, alreadyChanged, showSelectListChanged, labelList
    retrieveFlag = 0  # 1: start retrieve; 2: end retrieve
    showTextFields = ''
    previousLineIndent = ''
    for root, dirs, files in os.walk(target_dir):
        for file in files:
            if file.endswith('.jsp') and (file.lower() in teoconstants.uipgms):
                print('Processing ' + file)
                if inplaceFlag == 0:  #improve performance
                    f = fileinput.input(
                        root + "\\" + file,
                        inplace=inplaceFlag,
                        openhook=fileinput.hook_encoded('utf-8'))
                elif inplaceFlag == 1:
                    f = fileinput.input(root + "\\" + file,
                                        inplace=inplaceFlag)

                label = ''
                staticValues = ''
                totalValue = ''
                for i, line in enumerate(f):
                    if (re.search('posui:showSelectList', line,
                                  re.IGNORECASE)):
                        showSelectListCount += 1
                        retrieveFlag = 1
                        showTextFields += line
                    if retrieveFlag == 1:
                        if line in ('\n', '\r\n'):  # ignore blank line
                            continue
                        if (not re.search('/>', line)) and (not re.search(
                                'posui:showSelectList', line)):
                            previousLineIndent = (re.search(
                                '^(?P<indent>[ \t]*)[a-zA-Z\</\n]?', line,
                                re.IGNORECASE)).group('indent')
                            if re.search(
                                    'label.*\=',
                                    line):  ##1111111111111111111111111111111
                                if (re.search("\<.*\>", line)):
                                    if inplaceFlag == 1:
                                        sys.stdout.write(line)
                                        continue
                                else:
                                    m = re.search(
                                        '(?P<before>^.*)label.*=.*"(?P<label>.*)"',
                                        line)
                                    label = m.group('label')
                                    keyfoundFlag = 0
                                    keyFound = ''
                                    subList = []
                                    subList.append(file)
                                    subList.append(i + 1)
                                    subList.append(label)
                                    if inplaceFlag == 1:
                                        label = label.decode('utf-8')
                                    keyFound = findCorrespondentKey(label)
                                    if keyFound != label:
                                        keyfoundFlag = 1
                                        subList.append(1)
                                        keyFound = re.sub(
                                            '_000\d', '_0000', keyFound)
                                        line = m.group(
                                            'before'
                                        ) + 'label="' + keyFound + '"\n'
                                        ##                                        if inplaceFlag == 0:
                                        ##                                            print('FOUNDDDDDDD:'+ keyFound)
                                        if inplaceFlag == 1:
                                            line = line.encode('utf-8')
                                    else:
                                        subList.append(0)
                                    labelMainList.append(subList)

                            elif (re.search(
                                    'staticValues.*\=',
                                    line)):  ##222222222222222222222222222
                                if inplaceFlag == 0:
                                    ##                                    print('staticValues FOUND ==>'+line)
                                    pass
                                if (
                                        re.search("\<.*\>", line)
                                ):  ## E.g: totalValue="<%=PosM800500099ConstantsIF.C_LOV_ALL_VALUE%>"
                                    if inplaceFlag == 1:
                                        sys.stdout.write(line)
                                        continue
                                else:
                                    m = re.search(
                                        '(?P<before>^.*)staticValues.*=.*"(?P<staticValues>.*)"',
                                        line)
                                    staticValues = m.group('staticValues')
                                    staticValues = re.sub(
                                        '\|$', '', staticValues)  # strip a trailing '|'
                                    newString = ''
                                    for value in re.split('\|', staticValues):
                                        subList = []
                                        subList.append(file)
                                        subList.append(i + 1)
                                        subList.append(value)
                                        if inplaceFlag == 1:
                                            value = value.decode('utf-8')
                                        keyFound = findCorrespondentKey(value)
                                        if keyFound != value:
                                            subList.append(1)
                                            keyFound = re.sub(
                                                '_000\d', '_0000', keyFound)
                                            newString += keyFound + '|'
                                            if inplaceFlag == 0:
                                                print('FOUNDDDDDDD:' +
                                                      keyFound)
                                        else:
                                            subList.append(0)
                                            newString += value + '|'
                                            if inplaceFlag == 0:
                                                print(
                                                    'CAN NOT FIND APPROPRIATE KEY for staticValues'
                                                )
                                        labelMainList.append(subList)
                                    newString = re.sub('\|$', '', newString)
                                    line = m.group(
                                        'before'
                                    ) + 'staticValues="' + newString + '"\n'
                                    if inplaceFlag == 1:
                                        line = line.encode('utf-8')
                            elif (re.search('totalValue.*\=',
                                            line)):  ##3333333333333333333333
                                ##                                if inplaceFlag == 0:
                                ##                                    print('totalValue FOUND ==>'+line)
                                if (
                                        re.search("\<.*\>", line)
                                        or re.search('"-+"', line)
                                ):  ## E.g: totalValue="<%=PosM800500099ConstantsIF.C_LOV_ALL_VALUE%>"   ; totalValue="------------"
                                    if inplaceFlag == 1:
                                        sys.stdout.write(line)
                                        continue
                                else:
                                    m = re.search(
                                        '(?P<before>^.*)totalValue.*=.*"(?P<totalValue>.*)"',
                                        line)
                                    totalValue = m.group('totalValue')
                                    keyfoundFlag = 0
                                    subList = []
                                    subList.append(file)
                                    subList.append(i + 1)
                                    subList.append(totalValue)
                                    if inplaceFlag == 1:
                                        totalValue = totalValue.decode('utf-8')
                                    keyFound = findCorrespondentKey(totalValue)
                                    if keyFound != totalValue:
                                        keyfoundFlag = 1
                                        subList.append(1)
                                        line = m.group(
                                            'before'
                                        ) + 'totalValue="' + keyFound + '"\n'
                                        if inplaceFlag == 0:
                                            print('FOUNDDDDDDD:' + keyFound)
                                        elif inplaceFlag == 1:
                                            line = line.encode('utf-8')
                                    else:
                                        subList.append(0)
                                    labelMainList.append(subList)

                            showTextFields += line
                    if (re.search('/>', line)) and (retrieveFlag == 1):
                        retrieveFlag = 2
                        if re.search('isMultiLang', showTextFields) \
                           or re.search('isLabelMultiLang', showTextFields) \
                           or re.search('isTotalValueMultiLang', showTextFields) :
                            alreadyChanged += 1
                        if staticValues and not re.search(
                                'isMultiLang', showTextFields):
                            line = re.sub(
                                '^',
                                previousLineIndent + 'isMultiLang="true"\n',
                                line)
                        else:
                            if label and totalValue == '' and not re.search(
                                    'isLabelMultiLang',
                                    showTextFields):  # only label
                                line = re.sub(
                                    '^', previousLineIndent +
                                    'isLabelMultiLang="true"\n', line)
                            elif totalValue and label == '' and not re.search(
                                    'isTotalValueMultiLang',
                                    showTextFields):  # only totalValue
                                line = re.sub(
                                    '^', previousLineIndent +
                                    'isTotalValueMultiLang="true"\n', line)
                            elif totalValue and label and not re.search(
                                    'isLabelMultiLang',
                                    showTextFields) and not re.search(
                                        'isTotalValueMultiLang',
                                        showTextFields):
                                line = re.sub(
                                    '^', previousLineIndent +
                                    'isLabelMultiLang="true"\n', line)
                                line = re.sub(
                                    '^', previousLineIndent +
                                    'isTotalValueMultiLang="true"\n', line)

                        showTextFields += line
                        if inplaceFlag == 0:
                            sys.stdout.write(showTextFields)
                    if retrieveFlag == 2:
                        retrieveFlag = 0
                        keyFound = ''
                        showTextFields = ''
                        label = ''
                        staticValues = ''
                        totalValue = ''

                    if inplaceFlag == 1:
                        sys.stdout.write(line)

                f.close()
示例#47
0
                         (station, CODE, julian, latitude, longitude, epic_date))

        # write station header in decimal with PRES = -1, and 5 decimals
        # --------------------------------------------------------------
        xml_file.write("%3d  %4d %9.5f %8.5f %9.5f %s 1e36 1e36 1e36 1e36\n" %
                       (station, CODE, julian, latitude, longitude, epic_date))

    # substitute .hdr or .HDR in fileName with .asc
    # ---------------------------------------------
    fileName = re.sub(r'(?i)\.hdr$', '.asc', fileName)

    # we don't use the file object's built-in readline method, since it offers
    # no way to report the current line number
    # --------------------------------------------------------------------
    file = fileinput.input(
        fileName, openhook=fileinput.hook_encoded("ISO-8859-1"))

    # iterate over the lines of opened file "fileName"
    # ------------------------------------------------
    for line in file:

        # skip header line
        # ----------------
        if file.isfirstline():
            continue
        else:

            # extract data
            # ------------
            (scan, TimeJ, Pres, Depth, T0, T1, C0, C1, v1, v2, v1dt, v2dt, Xmiss, FlC, Aqua, Ox0, Ox1, S0, S1, sigmateta0,
             sigmateta1, sndvel0, sndvel1, nbin, flag) = line.split()
示例#48
0
    s_len = len(s)
    ngrams = []
    for n in xrange(1, min(size + 1, s_len + 1)):
        for i in xrange(s_len - n + 1):
            ngrams.append(s[i:i + n])
    return ngrams


def crfsuite_features(word, size, left_tpl, right_tpl):
    res = StringIO()
    for k in xrange(1, len(word)):
        left, right = word[:k], word[k:]
        left_size = min(len(left), size)
        right_size = min(len(right), size)
        print >> res, '%s\t%s' % (left_tpl[left_size - 1] % tuple(
            _char_ngrams(left[-size:], size)), right_tpl[right_size - 1] %
                                  tuple(_char_ngrams(right[:size], size)))
    return res.getvalue()


if __name__ == '__main__':
    N = 4  # n-gram size
    left_tpl = [crfsuite_feature_names(k, True) for k in xrange(1, N + 1)]
    right_tpl = [crfsuite_feature_names(k, True) for k in xrange(1, N + 1)]

    for word in fileinput.input(openhook=fileinput.hook_encoded("utf8")):
        print crfsuite_features(word.strip().lower(),
                                size=N,
                                left_tpl=left_tpl,
                                right_tpl=right_tpl)
示例#49
0
import fileinput
import struct

subFormat = '4s 2x 8s 1x 2s 2x 8s 1x 2s 2x'

fi = fileinput.FileInput(openhook=fileinput.hook_encoded("utf_16_le"))
outfile = 0  # output file handle; 0 means no file is open yet, so there is
# nothing to close the first time round
insub = 0  # when 1, we are processing a multiline subtitle

while 1:
    line = fi.readline()
    print line
    if line == '':
        break

    # If new file, initialize the subtitle counter and open a new file
    if (fi.isfirstline()):
        outfilename = (fi.filename())[0:len(fi.filename()) - 3] + u'srt'
        # close the previous file
        if outfile != 0:
示例#50
0
    _msg(
        " done.\nWords before: %d, words after: %d.\n"
        "(words constringed: %d, bytes saved: %d)\n%s\n", wcount, len(wlist),
        c_wcount, c_bsaved, '-' * 60)

    # A myspell dictionary starts with the number of words.
    if myspell:
        outfile.write(str(len(wlist)) + '\n')

    outfile.writelines(wlist)


if __name__ == "__main__":
    outfile = sys.stdout
    # Since Python 2.5, fileinput accepts an openhook (to decode from the
    # desired encoding). This only matters on py3 (py2 works with byte
    # strings, so recoding to unicode is unnecessary), but the openhook
    # does not work with stdin.
    if sys.version_info >= (3, ):
        import io
        if not sys.argv[1:]:
            # if there are no arguments, the data comes from stdin
            sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding=enc)
        outfile = io.TextIOWrapper(sys.stdout.buffer, encoding=enc)
        _fileinput = fileinput.input(openhook=fileinput.hook_encoded(enc))
    else:
        _fileinput = fileinput.input()

    sutrauka(_fileinput, outfile=outfile, myspell=False)
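
On Python 3.10+ the openhook is unnecessary for this: fileinput.input accepts encoding and errors parameters directly (stdin still follows the interpreter's own encoding, so the version check above keeps its purpose). A sketch, with enc as defined earlier in the script:

# Python 3.10+ variant: pass the encoding directly instead of an openhook.
_fileinput = fileinput.input(encoding=enc)
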
示例#51
0
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import fileinput
import sys, glob
import time

# Pick up any book whose file name starts with "Soci_" and ends with ".txt"
archivos = glob.glob("../Social Sciences books/training/Soci_*.txt")
archivos.sort()

for linea in fileinput.input(archivos,
                             openhook=fileinput.hook_encoded("utf-8")):
    if fileinput.isfirstline():
        # File name
        book = fileinput.filename()

        Busi_1 = open(book, encoding="utf-8").read()

        Busi1 = nltk.word_tokenize(Busi_1)

        Busi1 = [w.lower() for w in Busi1 if w.isalpha()]

        stop_words = set(stopwords.words('english'))

        filtered_book = [w for w in Busi1 if not w in stop_words]

        single_character = (
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'eg', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
示例#52
0
# This script is for extracting the grammar from the rust docs.

import fileinput

collections = {
    "gram": [],
    "keyword": [],
    "reserved": [],
    "binop": [],
    "unop": []
}

in_coll = False
coll = ""

for line in fileinput.input(openhook=fileinput.hook_encoded("utf-8")):
    if in_coll:
        if line.startswith("~~~~"):
            in_coll = False
        else:
            if coll in ["keyword", "reserved", "binop", "unop"]:
                for word in line.split():
                    if word not in collections[coll]:
                        collections[coll].append(word)
            else:
                collections[coll].append(line)

    else:
        if line.startswith("~~~~"):
            for cname in collections:
                if ("." + cname) in line:
示例#53
0
def parse_csv(csv_file, report, timeline):
    """
    Given the location of CSV and TXT files, parse the CSV for notable items

    Arguments:
        csv_file: path to csv output to parse
        report: accumulator for the lines of the text report (appended to)
        timeline: accumulator for the rows of the CSV timeline (appended to)
    """
    process_output = list()
    file_output = list()
    reg_output = list()
    net_output = list()
    error_output = list()
    remote_servers = list()
    if yara_folder and has_yara:
        yara_rules = yara_import_rules(yara_folder)
    else:
        yara_rules = ''

    # Use fileinput.input() now to read data line-by-line
    for original_line in fileinput.input(
            csv_file, openhook=fileinput.hook_encoded('iso-8859-1')):
        server = ''
        if original_line[
                0] != '"':  # Ignore lines that begin with Tab. Sysinternals breaks CSV with new processes
            continue
        line = original_line.strip(whitespace + '"')
        field = line.strip().split('","')
        try:
            if field[3] in ['Process Create'] and field[5] == 'SUCCESS':
                cmdline = field[6].split('Command line: ')[1]
                if not blacklist_scan(cmd_blacklist, field):
                    if generalize_paths:
                        cmdline = generalize_var(cmdline)
                    child_pid = field[6].split('PID: ')[1].split(',')[0]
                    outputtext = '[CreateProcess] %s:%s > "%s"\t[Child PID: %s]' % (
                        field[1], field[2], cmdline.replace('"',
                                                            ''), child_pid)
                    timelinetext = '%s,Process,CreateProcess,%s,%s,%s,%s' % (
                        field[0].split()[0].split('.')[0], field[1], field[2],
                        cmdline.replace('"', ''), child_pid)
                    process_output.append(outputtext)
                    timeline.append(timelinetext)

            elif field[3] == 'CreateFile' and field[5] == 'SUCCESS':
                if not blacklist_scan(file_blacklist, field):
                    path = field[4]
                    if os.path.isdir(path):
                        if generalize_paths:
                            path = generalize_var(path)
                        outputtext = '[CreateFolder] %s:%s > %s' % (
                            field[1], field[2], path)
                        timelinetext = '%s,File,CreateFolder,%s,%s,%s' % (
                            field[0].split()[0].split('.')[0], field[1],
                            field[2], path)
                        file_output.append(outputtext)
                        timeline.append(timelinetext)
                    else:  # This is for actual files. It's a huge try/except, sorry.
                        try:
                            md5 = md5_file(path)

                            yara_hits = ''
                            if yara_folder and yara_rules:
                                yara_hits = yara_filescan(path, yara_rules)

                            av_hits = ''
                            if has_virustotal:
                                av_hits = virustotal_scan_file(md5)

                            if generalize_paths:
                                path = generalize_var(path)

                            outputtext = '[CreateFile] %s:%s > %s\t[MD5: %s]%s%s' % (
                                field[1], field[2], path, md5, yara_hits,
                                av_hits)
                            timelinetext = '%s,File,CreateFile,%s,%s,%s,%s' % (
                                field[0].split()[0].split('.')[0], field[1],
                                field[2], path, md5)
                            file_output.append(outputtext)
                            timeline.append(timelinetext)
                        except (IndexError, IOError):
                            if generalize_paths:
                                path = generalize_var(path)
                            outputtext = '[CreateFile] %s:%s > %s\t[File no longer exists]' % (
                                field[1], field[2], path)
                            timelinetext = '%s,File,CreateFile,%s,%s,%s,N/A' % (
                                field[0].split()[0].split('.')[0], field[1],
                                field[2], path)
                            file_output.append(outputtext)
                            timeline.append(timelinetext)

            elif field[3] == 'SetDispositionInformationFile' and field[
                    5] == 'SUCCESS':
                if not blacklist_scan(file_blacklist, field):
                    path = field[4]
                    if generalize_paths:
                        path = generalize_var(path)
                    outputtext = '[DeleteFile] %s:%s > %s' % (
                        field[1], field[2], path)
                    timelinetext = '%s,File,DeleteFile,%s,%s,%s' % (
                        field[0].split()[0].split('.')[0], field[1], field[2],
                        path)
                    file_output.append(outputtext)
                    timeline.append(timelinetext)

            elif field[3] == 'SetRenameInformationFile':
                if not blacklist_scan(file_blacklist, field):
                    from_file = field[4]
                    to_file = field[6].split('FileName: ')[1].strip('"')
                    if generalize_paths:
                        from_file = generalize_var(from_file)
                        to_file = generalize_var(to_file)
                    outputtext = '[RenameFile] %s:%s > %s => %s' % (
                        field[1], field[2], from_file, to_file)
                    timelinetext = '%s,File,RenameFile,%s,%s,%s,%s' % (
                        field[0].split()[0].split('.')[0], field[1], field[2],
                        from_file, to_file)
                    file_output.append(outputtext)
                    timeline.append(timelinetext)

            elif field[3] == 'RegCreateKey' and field[5] == 'SUCCESS':
                if not blacklist_scan(reg_blacklist, field):
                    outputtext = '[RegCreateKey] %s:%s > %s' % (
                        field[1], field[2], field[4])
                    if not outputtext in reg_output:  # Ignore multiple CreateKeys. Only log the first.
                        timelinetext = '%s,Registry,RegCreateKey,%s,%s,%s' % (
                            field[0].split()[0].split('.')[0], field[1],
                            field[2], field[4])
                        reg_output.append(outputtext)
                        timeline.append(timelinetext)

            elif field[3] == 'RegSetValue' and field[5] == 'SUCCESS':
                if not blacklist_scan(reg_blacklist, field):
                    reg_length = field[6].split('Length:')[1].split(
                        ',')[0].strip(whitespace + '"')
                    if int(reg_length):
                        data_field = field[6].split('Data:')[1].strip(
                            whitespace + '"')
                        if len(data_field.split(' ')) == 16:
                            data_field += ' ...'
                        outputtext = '[RegSetValue] %s:%s > %s  =  %s' % (
                            field[1], field[2], field[4], data_field)
                        timelinetext = '%s,Registry,RegSetValue,%s,%s,%s,%s' % (
                            field[0].split()[0].split('.')[0], field[1],
                            field[2], field[4], data_field)
                        reg_output.append(outputtext)
                        timeline.append(timelinetext)

            elif field[3] == 'RegDeleteValue':  # and field[5] == 'SUCCESS':
                # SUCCESS is commented out to allow all attempted deletions, whether or not the value exists
                if not blacklist_scan(reg_blacklist, field):
                    outputtext = '[RegDeleteValue] %s:%s > %s' % (
                        field[1], field[2], field[4])
                    timelinetext = '%s,Registry,RegDeleteValue,%s,%s,%s' % (
                        field[0].split()[0].split('.')[0], field[1], field[2],
                        field[4])
                    reg_output.append(outputtext)
                    timeline.append(timelinetext)

            elif field[3] == 'RegDeleteKey':  # and field[5] == 'SUCCESS':
                # SUCCESS is commented out to allow all attempted deletions, whether or not the key exists
                if not blacklist_scan(reg_blacklist, field):
                    outputtext = '[RegDeleteKey] %s:%s > %s' % (
                        field[1], field[2], field[4])
                    timelinetext = '%s,Registry,RegDeleteKey,%s,%s,%s' % (
                        field[0].split()[0].split('.')[0], field[1], field[2],
                        field[4])
                    reg_output.append(outputtext)
                    timeline.append(timelinetext)

            elif field[3] == 'UDP Send' and field[5] == 'SUCCESS':
                if not blacklist_scan(net_blacklist, field):
                    server = field[4].split('-> ')[1]
                    # TODO: work on this later, once I can verify it better.
                    #if field[6] == 'Length: 20':
                    #    output_line = '[DNS Query] %s:%s > %s' % (field[1], field[2], protocol_replace(server))
                    #else:
                    outputtext = '[UDP] %s:%s > %s' % (
                        field[1], field[2], protocol_replace(server))
                    if outputtext not in net_output:
                        timelinetext = '%s,Network,UDP Send,%s,%s,%s' % (
                            field[0].split()[0].split('.')[0], field[1],
                            field[2], protocol_replace(server))
                        net_output.append(outputtext)
                        timeline.append(timelinetext)

            elif field[3] == 'UDP Receive' and field[5] == 'SUCCESS':
                if not blacklist_scan(net_blacklist, field):
                    server = field[4].split('-> ')[1]
                    outputtext = '[UDP] %s > %s:%s' % (
                        protocol_replace(server), field[1], field[2])
                    if outputtext not in net_output:
                        timelinetext = '%s,Network,UDP Receive,%s,%s' % (
                            field[0].split()[0].split('.')[0], field[1],
                            field[2])
                        net_output.append(outputtext)
                        timeline.append(timelinetext)

            elif field[3] == 'TCP Send' and field[5] == 'SUCCESS':
                if not blacklist_scan(net_blacklist, field):
                    server = field[4].split('-> ')[1]
                    outputtext = '[TCP] %s:%s > %s' % (
                        field[1], field[2], protocol_replace(server))
                    if outputtext not in net_output:
                        timelinetext = '%s,Network,TCP Send,%s,%s,%s' % (
                            field[0].split()[0].split('.')[0], field[1],
                            field[2], protocol_replace(server))
                        net_output.append(outputtext)
                        timeline.append(timelinetext)

            elif field[3] == 'TCP Receive' and field[5] == 'SUCCESS':
                if not blacklist_scan(net_blacklist, field):
                    server = field[4].split('-> ')[1]
                    outputtext = '[TCP] %s > %s:%s' % (
                        protocol_replace(server), field[1], field[2])
                    if outputtext not in net_output:
                        timelinetext = '%s,Network,TCP Receive,%s,%s' % (
                            field[0].split()[0].split('.')[0], field[1],
                            field[2])
                        net_output.append(outputtext)
                        timeline.append(timelinetext)

        except IndexError:
            if debug:
                sys.stderr.write(line)
                sys.stderr.write(format_exc())
            error_output.append(original_line.strip())

        # Enumerate unique remote hosts into their own section
        if server:
            server = server.split(':')[0]
            if server not in remote_servers and server != 'localhost':
                remote_servers.append(server)
    #} End of file input processing

    report.append('Processes Created:')
    report.append('==================')
    for event in process_output:
        report.append(event)

    report.append('')
    report.append('File Activity:')
    report.append('==================')
    for event in file_output:
        report.append(event)

    report.append('')
    report.append('Registry Activity:')
    report.append('==================')
    for event in reg_output:
        report.append(event)

    report.append('')
    report.append('Network Traffic:')
    report.append('==================')
    for event in net_output:
        report.append(event)

    report.append('')
    report.append('Unique Hosts:')
    report.append('==================')
    for server in sorted(remote_servers):
        report.append(protocol_replace(server).strip())

    if error_output:
        report.append('\r\n\r\n\r\n\r\n\r\n\r\nERRORS DETECTED')
        report.append('The following items could not be parsed correctly:')
        for error in error_output:
            report.append(error)
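
This example leans on helpers defined elsewhere in the script (blacklist_scan, generalize_var, protocol_replace). As a rough sketch only, protocol_replace presumably rewrites well-known ports into protocol names; the port table below is an assumption, not the project's actual mapping:

def protocol_replace(server):
    # Hypothetical sketch: swap a well-known port suffix for a protocol
    # name, e.g. '10.0.0.5:443' -> '10.0.0.5:https'.
    protocols = {':443': ':https', ':80': ':http', ':21': ':ftp'}
    for port, name in protocols.items():
        if server.endswith(port):
            return server[:-len(port)] + name
    return server
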
Example #54
0
import fileinput

def carregaDorks():
    # Read dork patterns from the files named on the command line (or from
    # stdin), decoded as ISO-8859-1; removeCRLF is a project helper that
    # strips the trailing line ending (sketched below).
    linhas = fileinput.input(openhook=fileinput.hook_encoded("ISO-8859-1"))
    dorks = []
    for linha in linhas:
        dorks.append(removeCRLF(linha))
    return dorks
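
removeCRLF is not shown in this example; a minimal sketch, assuming it simply strips the trailing line ending:

def removeCRLF(linha):
    # Hypothetical helper: drop any trailing CR/LF from one line.
    return linha.rstrip('\r\n')
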
Example #55
0
def check(errors, expected_lines):
    # Nested helper from CPython's test_fileinput; TESTFN and self come
    # from the enclosing TestCase method.
    with FileInput(files=TESTFN, mode='r',
                   openhook=hook_encoded('utf-8', errors=errors)) as fi:
        lines = list(fi)
    self.assertEqual(lines, expected_lines)
Example #56
0
def check(mode, expected_lines):
    # Same pattern as above, exercising hook_encoded('utf-7') under
    # different file modes.
    with FileInput(files=TESTFN, mode=mode,
                   openhook=hook_encoded('utf-7')) as fi:
        lines = list(fi)
    self.assertEqual(lines, expected_lines)
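
Both check helpers read like functions lifted from CPython's test_fileinput suite, where TESTFN names a scratch file. A standalone driver for the utf-8 variant might look roughly like this (file name and contents are illustrative):

import os
from fileinput import FileInput, hook_encoded

TESTFN = 'fileinput_scratch.txt'
with open(TESTFN, 'wb') as f:
    f.write('grüße\nworld\n'.encode('utf-8'))

# hook_encoded decodes each file with the given codec as it is opened.
with FileInput(files=TESTFN, mode='r',
               openhook=hook_encoded('utf-8', errors='strict')) as fi:
    assert list(fi) == ['grüße\n', 'world\n']
os.remove(TESTFN)
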
Example #57
0
File: head.py Project: zychen/stash
def main(args):
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("-n",
                   "--lines",
                   default=10,
                   type=int,
                   help="""print the first K lines instead of 10;
                   if negative, print the last -K lines""")
    p.add_argument("-q",
                   "--quiet",
                   "--silent",
                   action='store_true',
                   help="never print headers for each file")
    p.add_argument("-v",
                   "--verbose",
                   action='store_true',
                   help="always print headers for each file")
    p.add_argument("files", action="store", nargs="*", help="files to print")
    ns = p.parse_args(args)

    status = 0

    header_fmt = '==> {} <==\n'

    if len(ns.files) == 0:
        ns.files = ['-']

    try:
        for fname in ns.files:
            if ns.verbose or (len(ns.files) > 1 and not ns.quiet):
                if fname == '-':
                    print(header_fmt.format('standard input'), end='')
                else:
                    print(header_fmt.format(fname), end='')

            fileinput.close()
            inp = fileinput.input(fname,
                                  openhook=fileinput.hook_encoded("utf-8"))
            if ns.lines >= 0:
                buf = []
                for i, line in enumerate(inp):
                    if i >= ns.lines:
                        break
                    buf.append(line)
                for line in buf:
                    print(line, end='')
            else:
                # Keep a sliding window holding only the last -K lines;
                # inp is already open, so iterate it directly instead of
                # wrapping it in a second fileinput.input() call.
                buf = []
                for line in inp:
                    buf.append(line)
                    if len(buf) > -ns.lines:
                        del buf[0]
                for line in buf:
                    print(line, end='')

    except Exception as e:
        print('head: %s' % str(e))
        status = 1
    finally:
        fileinput.close()

    sys.exit(status)
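
Since main() ends with sys.exit(), calling it directly raises SystemExit; a hedged usage sketch (the file name is illustrative):

import sys

try:
    main(['-n', '3', 'notes.txt'])   # hypothetical input file
except SystemExit as e:
    sys.stderr.write('exit status: %s\n' % e.code)
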
Example #58
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output",
                        "-o",
                        help="write output to file instead of stdout")
    parser.add_argument(
        "--split",
        "-s",
        help="if writing to file, split into multiple files with this many "
        "lines per file",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--extra-field",
        "-e",
        help="extra fields to include. Provide a field name and a pointer to "
        "the field. Example: -e verified user.verified",
        nargs=2,
        action="append",
    )
    parser.add_argument("--excel",
                        "-x",
                        help="create file compatible with Excel",
                        action="store_true")
    parser.add_argument(
        "files",
        metavar="FILE",
        nargs="*",
        help="files to read, if empty, stdin is used",
    )
    args = parser.parse_args()

    file_count = 1
    csv_file = None
    if args.output:
        if args.split:
            csv_file = codecs.open(numbered_filepath(args.output, file_count),
                                   "wb", "utf-8")
            file_count += 1
        else:
            csv_file = codecs.open(args.output, "wb", "utf-8")
    else:
        csv_file = sys.stdout
    sheet = csv.writer(csv_file)

    extra_headings = []
    extra_fields = []
    if args.extra_field:
        for heading, field in args.extra_field:
            extra_headings.append(heading)
            extra_fields.append(field)

    sheet.writerow(get_headings(extra_headings=extra_headings))

    files = args.files if len(args.files) > 0 else ("-", )
    for count, line in enumerate(
            fileinput.input(files, openhook=fileinput.hook_encoded("utf-8"))):
        if args.split and count and count % args.split == 0:
            csv_file.close()
            csv_file = codecs.open(numbered_filepath(args.output, file_count),
                                   "wb", "utf-8")
            sheet = csv.writer(csv_file)
            sheet.writerow(get_headings(extra_headings=extra_headings))
            file_count += 1
        tweet = json.loads(line)
        sheet.writerow(
            get_row(tweet, extra_fields=extra_fields, excel=args.excel))
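
numbered_filepath, get_headings and get_row come from the surrounding module. As an assumption only, numbered_filepath likely injects the running file counter before the extension; a minimal sketch:

import os

def numbered_filepath(path, n):
    # Hypothetical helper: ('out.csv', 2) -> 'out-002.csv'.
    root, ext = os.path.splitext(path)
    return '%s-%03d%s' % (root, n, ext)
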
Example #59
0
    def labels_nl(self, fname = LABELS_NL):
        '''
            <http://nl.dbpedia.org/resource/Aannemer> <http://www.w3.org/2000/01/rdf-schema#label> "Aannemer"@nl .
            "lastpart" : { "type" : "string"},
            "lastpart_str" : { "type" : "string", "index": "not_analyzed" },
            "pref_title" : {"type" : "string"},
            "pref_title_str" : {"type" :"string", "index" : "not_analyzed"},
            "title" : { "type" : "string"},
            "title_str" : {"type" : "string", "index" : "not_analyzed" },
            "org_title" : {"type" : "string"},
            "org_title_str" : {"type" : "string", "index" : "not_analyzed"},
        '''

        INPUT_RE_STR = {
            'id_nl' : REGEX_LIST['id_nl'],
            'label' : REGEX_LIST['label'],
        }

        DISAMBIG = ['doorverwijspagina', 'disambiguation']

        self.type_op = "update" # Type of operation for ES
        input_re = {}
        dbpedia_obj = {}

        total_found = 0
        total_not_found = 0

        for regex in INPUT_RE_STR:
            input_re[regex] = []
            for rule in INPUT_RE_STR[regex]:
                input_re[regex].append(re.compile(rule))

        self.commit_buffer = []
        self.commit_total = 0
        self.commit = 0

        for line in fileinput.input(files=[fname],
                                    openhook=fileinput.hook_encoded("utf-8")):
            obj = {}
            for regex in input_re:
                for reg in input_re[regex]:
                    key = value = None
                    match_obj = reg.match(line)

                    if not match_obj:
                        continue

                    key = regex
                    value = match_obj.group(1)

                    obj[key] = value

            '''
            for item in DISAMBIG:
                if value and value.find(item) > -1:
                    continue
            
            disambig = 0
            if value and value.find('(') > -1:
                disambig = 1
            '''

            if obj:
                res = ES.search(index=ES_INDEX_NAME, q='id_nl:"%s"' % obj['id_nl'])

                if res.get('hits').get('total') != 1:
                    self.no_id_found += 1
                    print(res.get('hits').get('total'))
                else:
                    obj['id'] = res.get('hits').get('hits')[0].get('_id')
                    obj['lastpart'] = obj['lastpart_str'] = normalize(value).split('(')[0].strip().split(' ')[-1]
                    obj['pref_title'] = obj['pref_title_str'] = value
                    obj['title'] = obj['title_str'] = normalize(value)
                    obj['org_title'] = obj['org_title_str'] = value
                    self.commit_buffer.append(obj)
                    self.commit += 1

            if self._check_commit():
                break
Example #60
0
		print(str(directory))
		print(str(file))

		#convert to html in temp file
		f = open(file, 'rb')
		b = open(temp, 'wb')
		document = mammoth.convert_to_html(f)
		b.write(document.value.encode('utf8'))
		b.close()
		f.close()

		#write the output file, applying each find-and-replace pair to every line
		f = open(output, 'wb')
		with fileinput.FileInput(temp, inplace=False, openhook=fileinput.hook_encoded('utf-8', 'surrogateescape')) as fh:
			for line in fh:
				for old, new in zip(find, replace):
					line = line.replace(old, new)
				f.write(line.encode('utf-8'))
		f.close()

#remove temp file
os.remove(temp)
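
The fragment above arrives without its setup; presumably find and replace are parallel lists of strings, and temp/output are file paths. A purely illustrative reconstruction of that missing context:

find = ['&amp;', '&#8217;']   # hypothetical strings to search for
replace = ['&', "'"]          # hypothetical replacements, in the same order
temp = 'temp.html'            # hypothetical scratch file path
output = 'output.html'        # hypothetical destination path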