def get_revision_version(): """goes through all files in lib and sums the last number in the CVS revision strings.""" files=[] print "Considering these files for release tag:" for d in ['Lib','bin',]: for f in os.listdir(d): if f[-3:]=='.py' or f=='ovf2vtk': fn = os.path.join(d,f) if os.path.exists(fn): files.append(fn) print " ... %s ..." % (fn) else: print 'File "%s" does not exists. Skipping.'%(fn) revision_version = 0 for l in fileinput.input(files): m = re.match(r'.*?\$Re[v]ision:\s*\d+[.](?P<rev>\d+)\s*\$',l) if m: revision_version = revision_version + eval(m.group('rev')) fileinput.nextfile() print "Done. Version is %s" % str(revision_version) return revision_version
def test_state_is_None(self):
    """Tests fileinput.nextfile() when fileinput._state is None.

    Ensure that it raises RuntimeError with a meaningful error message
    and does not modify fileinput._state.
    """
    fileinput._state = None
    with self.assertRaises(RuntimeError) as ctx:
        fileinput.nextfile()
    self.assertEqual(ctx.exception.args, ("no active input()",))
    self.assertIsNone(fileinput._state)
def test_state_is_None(self):
    """Tests fileinput.nextfile() when fileinput._state is None.

    Ensure that it raises RuntimeError with a meaningful error message
    and does not modify fileinput._state.
    """
    fileinput._state = None
    with self.assertRaises(RuntimeError) as raised:
        fileinput.nextfile()
    expected_args = ("no active input()", )
    self.assertEqual(expected_args, raised.exception.args)
    self.assertIsNone(fileinput._state)
def get_file_name(file):
    """Return the name(s) fileinput reports while reading *file*.

    fileinput.filename() is only meaningful once a line has been read.
    For a path that is missing or not a regular file, a message is
    printed and None is returned implicitly (original contract kept).
    """
    if os.path.exists(file) and os.path.isfile(file):
        names = []
        for line in fileinput.input(file):
            name = fileinput.filename()
            # FIX: identity test `is not None` instead of `!= None`.
            if name is not None:
                # Close the current file so iteration stops after the
                # first line; the reported name is only needed once.
                fileinput.nextfile()
                names.append(name)
        return names
    else:
        # FIX: corrected the broken error message ("is not exits!").
        print('the path [{}] does not exist!'.format(file))
def CHKSUMfilter(files):
    """Copy each file whose CKSUM line has not been seen before to `out`,
    skipping the remainder of a file once its CKSUM line is found."""
    seen = set()
    cksum_re = re.compile('^CKSUM')
    for record in fileinput.input(files):
        if cksum_re.search(record):
            if record not in seen:
                shutil.copy(fileinput.filename(), out)
                seen.add(record)
            fileinput.nextfile()
    fileinput.close()
def get_file_name(file): '''只有当文件被读的时候才会取得filename,否则返回None''' if os.path.exists(file) and os.path.isfile(file): names = [] for line in fileinput.input(file): name = fileinput.filename() if name != None: fileinput.nextfile() names.append(name) return names else: print 'path [{}] is not exists~'.format(file)
def CHKSUMfilter(files):
    """Copy files carrying a previously-unseen CKSUM line into `out`."""
    recorded_lines = set()
    pattern = re.compile('^CKSUM')
    for current_line in fileinput.input(files):
        if pattern.search(current_line) is None:
            continue
        if current_line not in recorded_lines:
            recorded_lines.add(current_line)
            shutil.copy(fileinput.filename(), out)
        # A CKSUM line ends our interest in this file.
        fileinput.nextfile()
    fileinput.close()
def my_glob(p, args, s, base_hash):
    """Glob for args.filename under p/s (optionally recursive), verify any
    sibling .hash files, and write a hash computed from the first line of
    each matched file."""
    for file in glob.iglob('{}/{}/{}'.format(p, s, args.filename),
                           recursive=args.flag_recursive):
        if os.path.isdir(file):
            # BUG FIX: the original called fileinput.nextfile() here.  If
            # the first glob hit is a directory no fileinput stream is
            # active yet, so that raised RuntimeError("no active input()").
            # Simply skip directories instead.
            continue
        with os.scandir(os.path.dirname(os.path.realpath(file))) as it:
            for entry in it:
                if pathlib2.PurePath(entry).suffix == '.hash':
                    internal_hash_check(entry, args)
        for line in fileinput.input(file):
            folder_path_to_save_in, filename = pathname_finder(file)
            file_hashed = hash_make(line, base_hash)
            hash_write_to_file(file_hashed, folder_path_to_save_in,
                               args.filename)
            # Only the first line is hashed; close the file and move on.
            fileinput.nextfile()
def TCfilter(files, cutoff):
    """Copy every file whose TC score is below *cutoff* into `out`."""
    for line in fileinput.input(files):
        if not line.startswith("TC "):
            continue
        score = line.split()[1]
        if score == "------":
            # No score recorded for this file; skip the rest of it.
            fileinput.nextfile()
        elif float(score) < cutoff:
            shutil.copy(fileinput.filename(), out)
        else:
            # Score at or above the cutoff: not interesting, next file.
            fileinput.nextfile()
    fileinput.close()
def _get_markdown_title(self, markdown_file): """ Return the first line of the given file having stripped off any leading #'s or spaces. Truncate the result to 50 chars """ for line in fileinput.input(markdown_file): title = line break; fileinput.nextfile() matches = re.match(r'^[#]*[ ]*([ \S]+)\n$', title) title = matches.group(1) title = title[:50] title = title.replace("\n", "") return title
def parse(self):
    """Parse product pages fed in via fileinput (argv files / stdin).

    Scans each file for title, brand, nutritional-info and caloric
    blocks; when a new file starts, the previous file's accumulated
    record is appended to self.result.

    NOTE(review): the record of the very last file appears never to be
    flushed — appending only happens on the *next* file's first line.
    Confirm against the caller.
    """
    skip = -1        # countdown to the data line we want to capture
    skipping = None  # which block the countdown belongs to: 'nutri'/'caloric'
    title = None
    brand = ''
    ingredients = None
    calories = None
    filename = ''
    for line in fileinput.input():
        if fileinput.isfirstline():
            # A new file begins: flush the previous file's record.
            if title:
                self.result.append({
                    'name': self.decode(title),
                    'url': urllib.parse.unquote(self.name2url(filename)),
                    'brand': self.decode(brand),
                    'calories': calories,
                    'ingredients': ingredients,
                })
                title = None
                brand = ''
                ingredients = None
            filename = fileinput.filename()
        if title is None and '<title>' in line:
            # First comma-separated chunk of the <title> tag's text.
            title = line.split('>')[1].split(',')[0]
        if not brand and 'brand:' in line:
            brand = line.split("'")[1]
        if skip > 0:
            skip -= 1
        elif skip == 0:
            # This is the line the countdown pointed at.
            skip = -1
            if skipping == 'nutri':
                ingredients = self.getIngredients(self.decode(line))
            elif skipping == 'caloric':
                calories = self.getCalories(self.decode(line))
                # Calories is the last datum of interest in a file.
                fileinput.nextfile()
        elif NUTRI_INFO in line:
            skip = 1
            skipping = 'nutri'
        elif CALORIC in line:
            skip = 1
            skipping = 'caloric'
def test_missing_debug_statements(self):
    """Scan every .py file under self.source_dir (one level deep) and
    assert that none contains a missing-debug statement; the generated
    file generate_categories.py is exempt."""
    message = "\nFound a debug missing statement at line %d of file %r: %r"
    current_name = None
    py_sources = (glob.glob(os.path.join(self.source_dir, '*.py'))
                  + glob.glob(os.path.join(self.source_dir, '*/*.py')))
    for source_line in fileinput.input(py_sources):
        if fileinput.isfirstline():
            current_name = os.path.basename(fileinput.filename())
            if current_name == 'generate_categories.py':
                # Generated file: skip it entirely.
                fileinput.nextfile()
                continue
        line_no = fileinput.filelineno()
        hit = self.missing_debug.search(source_line)
        self.assertIsNone(
            hit,
            message % (line_no, current_name, hit.group(0) if hit else None))
def extractTitle(inPath, outPath):
    "extract title from texts and put all into one file"
    # Each subdirectory of inPath is treated as a category label; the
    # first non-URL line of every file in it is taken as the title,
    # tokenized, and written as "label<TAB>tokens" to outPath.
    labels = os.listdir(inPath)
    outFile = open(outPath, 'w')
    num = {}  # per-label count of extracted titles
    for label in labels:
        num[label] = 0
        for line in fileinput.input(glob.glob(inPath+"/"+label+"/*")):
            if not line.startswith('http://'):
                # Input files are GBK-encoded; undecodable bytes dropped.
                line_uni = unicode(line, 'gbk', 'ignore')
                # tokenize
                tokens = " ".join(jieba.cut(line_uni))
                outFile.write( "%s\t%s" % (label, tokens))
                num[label]+=1
                # Title found: skip the rest of this file.
                fileinput.nextfile()
        fileinput.close()
    outFile.close()
    print "Extract Files"
    for label, n in num.iteritems():
        print "%-10s : %d" % (label, n)
def extractTitle(inPath, outPath):
    "extract title from texts and put all into one file"
    # Treats each subdirectory of inPath as a label; extracts the first
    # line that is not a URL from each contained file as its title.
    labels = os.listdir(inPath)
    outFile = open(outPath, 'w')
    num = {}  # titles extracted per label
    for label in labels:
        num[label] = 0
        for line in fileinput.input(glob.glob(inPath + "/" + label + "/*")):
            if not line.startswith('http://'):
                # Source files are GBK; ignore undecodable bytes.
                line_uni = unicode(line, 'gbk', 'ignore')
                # tokenize
                tokens = " ".join(jieba.cut(line_uni))
                outFile.write("%s\t%s" % (label, tokens))
                num[label] += 1
                # One title per file: move on to the next file.
                fileinput.nextfile()
        fileinput.close()
    outFile.close()
    print "Extract Files"
    for label, n in num.iteritems():
        print "%-10s : %d" % (label, n)
def load_bib_lines(filenames):
    """Load *.tex files and read them line by line.
    This method only loads the bibliography section and checks for ascii"""
    bibliography = {}  # maps filename -> list of bibitem lines
    bibsection = 0     # 1 while inside \begin{thebibliography}...\end{...}
    biberrors = 0      # count of non-ASCII lines encountered
    filenames = expandFilenames(filenames)
    for line in fileinput.input(filenames, mode='rU'):
        #iterate until we get to a bibitem section
        line = line.strip()
        if line.startswith(r"\begin{thebibliography}"):
            #mark lines
            bibitems = []
            bibsection = 1
            continue
        elif line.startswith(r"\end{thebibliography}"):
            # Bibliography complete: store it and skip the rest of the file.
            bibliography[fileinput.filename()] = bibitems
            bibitems = []
            bibsection = 0
            fileinput.nextfile()
        if bibsection == 1:
            if not line.isspace():
                try:
                    # Python 2: str.decode raises on non-ASCII bytes.
                    line = line.decode("ascii")
                    candline = removeComment(line)
                    if candline:
                        bibitems.append(candline)
                except UnicodeDecodeError:
                    print "Special Character on line {0} in file {1}".format(fileinput.filelineno(), fileinput.filename())
                    print line
                    print "-".center(80, '-')
                    biberrors += 1
    if biberrors > 0:
        print "{0} errors detected. Received non-ASCII input".format(biberrors)
        #return an empty list so we don't process bad output
        return []
    return split_bibitems(bibliography)
def create_grep_match_generator(regexp, paths, is_file_only=False, path_pfx=os.environ.get('PATH_PFX')):
    '''Create grep pattern match generator.

    Yields "filename:line" for every line matching *regexp* in the files
    derived from *paths*, or just the filename once per file when
    *is_file_only* is true (grep -l style).

    NOTE(review): the path_pfx default is evaluated once at import time,
    so later changes to PATH_PFX are not picked up.
    NOTE(review): a UnicodeDecodeError in any file silently terminates
    the whole generator, skipping all remaining files — confirm this
    best-effort behavior is intended.
    '''
    files = _create_files_list(xform_args(paths, path_pfx))
    try:
        for line in fileinput.input(files):
            if re.search(regexp, line):
                line = line.rstrip('\r\n')
                # File-only mode emits no line text after the filename.
                end_txt = '' if is_file_only else ':{0}'.format(line)
                yield fileinput.filename() + end_txt
                if is_file_only:
                    # One hit per file is enough; jump to the next file.
                    fileinput.nextfile()
    except UnicodeDecodeError as exc:
        # print(repr(exc))
        pass
def test_state_is_not_None(self):
    """Tests fileinput.nextfile() when fileinput._state is not None.

    Ensure that it invokes fileinput._state.nextfile() exactly once,
    returns whatever it returns, and does not modify fileinput._state
    to point to a different object.
    """
    nextfile_retval = object()
    instance = MockFileInput()
    instance.return_values["nextfile"] = nextfile_retval
    fileinput._state = instance
    retval = fileinput.nextfile()
    self.assertExactlyOneInvocation(instance, "nextfile")
    self.assertIs(retval, nextfile_retval)
    # FIX: the docstring promises that fileinput._state is not rebound,
    # but the original never asserted it (the sibling variant of this
    # test in this file does).
    self.assertIs(fileinput._state, instance)
def test_state_is_not_None(self):
    """Tests fileinput.nextfile() when fileinput._state is not None.

    Ensure that it invokes fileinput._state.nextfile() exactly once,
    returns whatever it returns, and does not modify fileinput._state
    to point to a different object.
    """
    sentinel = object()
    mock_input = MockFileInput()
    mock_input.return_values["nextfile"] = sentinel
    fileinput._state = mock_input
    result = fileinput.nextfile()
    self.assertExactlyOneInvocation(mock_input, "nextfile")
    self.assertIs(result, sentinel)
    self.assertIs(fileinput._state, mock_input)
def input_files():
    """
    Return a list containing tuples such that return[0] is the name of
    the file and return[1] is a slurped string of the file's contents.

    # TODO: This function doesn't work if there were options passed. In
    fact, it attempts to open option strings as filenames. Option
    strings could be removed from `sys.argv`, I guess.

    This function hangs if it doesn't get input. Press Ctrl-D (EOF) to
    continue.
    """
    # Collect every option string declared in PARSER_ARGUMENTS so they
    # can be stripped from sys.argv before fileinput sees them.
    OPT_STRS = list()
    for item in PARSER_ARGUMENTS:
        OPT_STRS += item[0]
    i = 1
    while i < len(sys.argv):
        # Compare only the part before '=' (handles --opt=value forms).
        s = sys.argv[i].split('=')[0].strip()
        if s in OPT_STRS:
            # Remove the option in place; do not advance i, since the
            # next element shifted into position i.
            t = sys.argv.pop(i)
            # if s in {"-c","--config","--logfile","--tempdir","-i","--input","-o","--output"}:
            #     t = sys.argv.pop(i)
        else:
            i += 1
    files = []
    s = ''  # accumulator for text piped in on stdin
    for line in fileinput.input():
        if fileinput.isstdin():
            s += line
        elif fileinput.isfirstline():
            # Real file: slurp it in one go and skip fileinput's
            # line-by-line traversal of it.
            fname = fileinput.filename()
            with open(fname) as f:
                files.append((fname, f.read()))
            fileinput.nextfile()
    if s:
        # stdin content is reported with a None filename.
        files.append((None, s))
    return files
def parse(self, key=None, encoding="U8"):
    """Yield (title, subtitle, author, body) tuples parsed from the
    files matching the self.files glob pattern, sorted by *key* and
    decoded with *encoding* (default UTF-8).

    self.s accumulates the lines of the current article; the predicates
    is_need/is_end and the extractors gettitle/getsubtitle/getauthor/
    getbody are provided elsewhere on the class.
    """
    files = self.files
    globs = sorted(glob.glob(files), key=key)
    if globs:
        for line in fileinput.input(globs, openhook=fileinput.hook_encoded(encoding)):
            if "<title>" in line:
                self.s = ""
                # [7:-8] strips the "<title>" / "</title>" markup.
                title = self.gettitle(line.strip()[7:-8])
                if not title:
                    # Untitled document: discard and skip this file.
                    self.s = ""
                    fileinput.nextfile()
                    continue
            if self.is_need(line):
                self.s += line.strip() + "\n"
            if self.is_end(line):
                if self.getbody() == "":
                    # Nothing accumulated: skip to the next file.
                    self.s = ""
                    fileinput.nextfile()
                    continue
                yield title, self.getsubtitle(), self.getauthor(), self.getbody()
                self.s = ""
                fileinput.nextfile()
def parse(self, key=None, encoding="U8"):
    """Generator over articles found in the files matching self.files.

    Files are globbed, sorted by *key*, opened with *encoding*, and
    scanned; each completed article is yielded as a
    (title, subtitle, author, body) tuple.  The buffer self.s holds the
    lines of the article currently being assembled.
    """
    files = self.files
    globs = sorted(glob.glob(files), key=key)
    if globs:
        for line in fileinput.input(
                globs, openhook=fileinput.hook_encoded(encoding)):
            if "<title>" in line:
                self.s = ""
                # Slice away the surrounding <title>...</title> tags.
                title = self.gettitle(line.strip()[7:-8])
                if not title:
                    # No usable title: abandon this file.
                    self.s = ""
                    fileinput.nextfile()
                    continue
            if self.is_need(line):
                self.s += line.strip() + "\n"
            if self.is_end(line):
                if self.getbody() == "":
                    # Empty body: nothing to yield for this file.
                    self.s = ""
                    fileinput.nextfile()
                    continue
                yield title, self.getsubtitle(), self.getauthor(
                ), self.getbody()
                self.s = ""
                fileinput.nextfile()
#-*- coding: UTF-8 -*- __author__ = 'mcxiaoke' import os,fileinput # 内部使用FileInput类实现 # fileinput 是一个从标准输入或者文件按行读入的工具类,默认是文本模式,一般的用法是 for line in fileinput.input(['file_input.py']): print fileinput.filename() # 文件名 print fileinput.fileno() #文件描述符,int print fileinput.lineno() #总的行号 print fileinput.filelineno() #当前文件的行号 print fileinput.isstdin() # 是否标准输入 print fileinput.isfirstline() # 是否是文件的第一行 print fileinput.nextfile() # 关闭当前文件,开始读下一个文件 print fileinput.close() # 关闭输入序列
def main():
    """Scan Warcraft game-log text files and build a list of per-game
    records, writing the result to the file "list_of_games".

    Files are read via one fileinput stream; fixed line numbers within
    each file carry known fields (4: mode, 5: length, 7-12: players).
    """
    good_games = 0
    total_games = 0
    short_games = 0
    bugged_games = 0
    game_list = []
    Wfile = open("list_of_games", "w")
    #Loop file lines
    for line in fileinput.input(glob.glob("/Users/Ben/Desktop/Misc_Docs/wcscans/processedFiles/Solo/game*.txt")):
        #print fileinput.filename()
        #Make sure game is greater than zero mins
        if fileinput.filelineno()==5:
            if "Game Length: 0 minutes" in line:
                #print "Game Length Zero"
                fileinput.nextfile()
                short_games += 1
            if "Game Length: 1 minutes" in line:
                #print "Game Length One"
                fileinput.nextfile()
                short_games += 1
        #Make sure game is solo
        if fileinput.filelineno()==4:
            if "Solo" not in line:
                print "Not Solo"
                fileinput.nextfile()
        if fileinput.isfirstline():
            # First line, initialize
            #print fileinput.filename()
            game_info = {}
            # NOTE(review): dict.fromkeys returns a NEW dict; this call
            # discards its result, so game_info stays empty here.
            game_info.fromkeys(game_fields)
            total_games += 1
        #Get game date/time
        if fileinput.filelineno()==2:
            game_info['date_time'] = get_date_time(line)
        #player names, races, levels, winner
        try:
            if fileinput.filelineno()==7:
                game_info['player1_name'] = get_player_name(line)
                game_info['player1_race'] = get_player_race(line)
            if fileinput.filelineno()==8:
                game_info['player1_level'] = get_player_level(line)
            if fileinput.filelineno()==10:
                game_info['player2_name'] = get_player_name(line)
                game_info['player2_race'] = get_player_race(line)
            if fileinput.filelineno()==11:
                game_info['player2_level'] = get_player_level(line)
            if fileinput.filelineno()==12:
                game_info['winning_player'] = get_winning_player(line, game_info)
        except:
            # NOTE(review): bare except hides real errors (including
            # KeyboardInterrupt); consider `except Exception`.
            bugged_games += 1
            fileinput.nextfile()
        #finalize
        if fileinput.filelineno()>12:
            if any (field not in game_info for field in game_fields):
                print "One of the fields for this game is empty, failed to read game."
                fileinput.nextfile()
            else:
                good_games += 1
                game_list.append(game_info)
                if good_games % 1000 == 0:
                    print "Scanned over " + str(good_games) + " games"
                fileinput.nextfile()
    print "Short games not counted = " + str(short_games)
    print "Bugged games not counted = " + str(bugged_games)
    print "Good games = " + str(good_games)
    print "Total games = " + str(total_games)
    #print "\n".join(str(v) for v in game_list)
    Wfile.write("list_of_games = " + str(game_list))
# webbrowser模块 import webbrowser # webbrowser.open("https://github.com") # fileinput模块 import fileinput # 可以读取参数传入文件,将每一行作为一个迭代器,可以用for来迭代 # 如python some.py f1.txt f2.txt print fileinput.input() fileinput.filename() fileinput.lineno() fileinput.filelineno() fileinput.isfirstline() fileinput.isstdin() fileinput.nextfile() # fileinput.close() # set模块 print set(range(10)) # set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) # set取唯一的值 print set([0, 1, 2, 3, 0, 3]) # set([0, 1, 2, 3]) a = set([1, 2, 3]) b = set([2, 3, 4]) print a.union(b) # set([1, 2, 3, 4]) print a | b # set([1, 2, 3, 4]) __or__实现 c = a & b # __and__实现 print c # set([2, 3]) print c.issubset(a) # True print c <= a # True __le__实现 print c.issuperset(a) # False
linenumber = fileinput.filelineno() prevline = line yield prevline, True pattern = re.compile(r'(<<(\/?) *(if|for|else|switch|case|replace|link)[^a-zA-Z][^<>]*)') tagfound = [] try: for line, isLastLine in yield_line_and_islastline(fileinput.input()): for (whole,end,tag) in re.findall(pattern,line): if tag == "else" or tag == 'case': if len(tagfound) == 0: myprint("Found", tag, "but with no opening tag:") myprint(" ", linenumber,":", whole) fileinput.nextfile() lasttag = tagfound[-1] if (tag == "else" and lasttag["tag"] != "if") or (tag == "case" and lasttag["tag"] != "switch"): myprint("Mismatched else: Opening tag was:") myprint(" ",lasttag["linenumber"],":", lasttag["whole"]) myprint("But this tag was:") myprint(" ",linenumber,":", whole) fileinput.nextfile() break elif end != '/': tagfound.append({"whole": whole, "linenumber":linenumber,"tag":tag}) else: if len(tagfound) == 0: myprint("Found closing tag but with no opening tag:") myprint(" ", linenumber,":", whole) fileinput.nextfile()
def nextfile(file):
    """Close the current fileinput file and move on to the next one.

    The *file* argument is accepted for interface compatibility but is
    not used; the call is delegated to the module-global fileinput state.
    """
    return fileinput.nextfile()
def main_work():
    """Command-line entry point: load a trained Voice and synthesise the
    given text files (or stdin) paragraph by paragraph, optionally
    playing and/or writing label files.

    SSML input (<speak>/<xml> first line) is parsed as a whole document;
    plain text is grouped into paragraphs separated by blank lines.
    """
    #################################################
    # root is one level below this file in directory structure,
    # ie. below the 'scripts' folder
    ROOT = os.path.split(
        os.path.realpath(
            os.path.abspath(
                os.path.dirname(inspect.getfile(
                    inspect.currentframe())))))[0] + '/'
    dirs = {
        'ROOT': ROOT,
        'CONFIG': ROOT + "configs/",
        'VOICES': ROOT + "voices/",
        'TRAIN': ROOT + "train/",
        'RULES': ROOT + "rules/",
        'CORPUS': ROOT + "corpus/",
        'BIN': ROOT + "/tools/bin/"
    }
    # ======== Get stuff from command line ==========
    a = ArgumentParser()
    a.add_argument('-s', dest='speaker', required=True, \
        help= "the name of the speaker: <ROOT>/corpus/<LANG>/<SPEAKER>")
    a.add_argument('-l', dest='lang', required=True, \
        help= "the language of the speaker: <ROOT>/corpus/<LANG>")
    a.add_argument('-o', dest='output', required=False, default=False, \
        help= "output audio here")
    a.add_argument('-t', dest='stage', required=False, default="runtime", \
        help=""" defines the current usage stage (definitions of stages should by found in <config>/recipe.cfg""")
    a.add_argument('-play', dest='play', action="store_true", required=False, default=False, \
        help=" play audio after synthesis")
    a.add_argument('-lab', dest='make_label', action="store_true", default=False, \
        help= "make label file as well as wave in output location")
    a.add_argument('config', help="""configuration to use: naive, semi-naive, gold, as defined in <ROOT>/recipes/<config> -directory""" )
    a.add_argument('-bin', dest='custom_bindir')
    a.add_argument('files', nargs='*', help="text files to speak, reading from stdin by default")
    a.add_argument('-m', dest='model_dir', required=True, type=str, help="model directory")
    opts = a.parse_args()
    # Model directory overrides the default train/voices locations.
    dirs['TRAIN'] = opts.model_dir + "/train/"
    dirs['VOICES'] = opts.model_dir + "/voices/"
    if opts.custom_bindir != None:
        dirs['BIN'] = opts.custom_bindir
    voice_location = os.path.join(dirs['VOICES'], opts.lang, opts.speaker, opts.config)
    train_location = os.path.join(dirs['TRAIN'], opts.lang, "speakers", opts.speaker, opts.config)
    config_path = os.path.join(dirs['CONFIG'], opts.config)
    voice_config = os.path.join(config_path, fname.RECIPE)
    ## Make Voice object to contain voice elements trained on this corpus:
    voice = Voice(opts.speaker, opts.lang, opts.config, opts.stage, dirs)
    if not opts.output:
        output_wavefile = os.path.join(voice_location, 'output', 'wav', 'temp.wav')
    else:
        output_wavefile = opts.output
    if not opts.output:
        output_labfile = None
    else:
        output_labfile = output_wavefile.replace('.wav', '.lab')
    prevspace = False
    para = []
    # Go through the files a paragraph at a time, unless it's SSML in
    # which case we parse it.  An empty line marks the change of
    # paragraphs in plain text files.
    for line in fileinput.input(opts.files):
        line = line.decode('utf-8').rstrip()
        t = start_clock('Synthesise sentence')
        print line
        if fileinput.isfirstline():
            # New file: flush the previous file's trailing paragraph.
            if para != []:
                voice.synth_utterance(''.join(para), output_wavefile=output_wavefile, \
                    output_labfile=output_labfile)
                if opts.play:
                    os.system('play ' + output_wavefile)
                para = []
            line = line.lstrip()
            if line.startswith('<speak') or line.startswith('<xml'):
                # SSML document: hand the whole file to the XML parser
                # and skip fileinput's line-wise traversal of it.
                tree = etree.parse(fileinput.filename())
                parseSSML(tree, voice)
                fileinput.nextfile()
            else:
                para.append(line)
        elif line.isspace():
            # NOTE(review): line was rstrip()ed above, so a blank line
            # becomes '' and isspace() is always False here — the
            # paragraph-break branch below may be unreachable; confirm.
            prevspace = True
        elif prevspace and para != []:
            # Paragraph boundary: synthesise what we have accumulated.
            voice.synth_utterance(''.join(para), output_wavefile=output_wavefile, \
                output_labfile=output_labfile)
            prevspace = False
            para = [line]
        else:
            para.append(line)
    # Flush the final paragraph.
    if para != []:
        voice.synth_utterance(''.join(para), output_wavefile=output_wavefile, \
            output_labfile=output_labfile)
        if opts.play:
            os.system('play ' + output_wavefile)
    # NOTE(review): t is only bound inside the loop; with no input lines
    # this raises NameError.
    stop_clock(t)
def export_swadesh_entries(input_path, output_path=None):
    """Export dictionary and wordlist entries whose Spanish side contains
    a (stemmed) Swadesh-list word from the corpus at *input_path* into a
    reduced corpus at *output_path*.

    Support CSV tables are copied verbatim; entry.csv / annotation.csv
    (and their wordlist counterparts) are filtered by matching entry id.
    """
    print("Input: {0}".format(input_path))
    print("Ouput: {0}".format(output_path))
    cr = CorpusReaderDict(input_path)
    print("Data loaded")
    # Tables copied through unchanged.
    files = [
        "book.csv", "component.csv", "corpusversion.csv", "dictdata.csv",
        "language_iso.csv", "language_bookname.csv", "language_src.csv",
        "language_tgt.csv", "nondictdata.csv", "wordlistdata.csv",
        "wordlistconcept.csv"
    ]
    for f in files:
        shutil.copyfile(os.path.join(input_path, f), os.path.join(output_path, f))
    from nltk.stem.snowball import SpanishStemmer
    stemmer = SpanishStemmer()
    import qlc.utils
    #get stopwords
    stopwords = qlc.utils.stopwords_from_file(os.path.join(os.path.dirname(
        os.path.realpath(__file__)), "data", "stopwords", "spa.txt"))
    # load swadesh list (one or more comma-separated words per line)
    swadesh_file = codecs.open(os.path.join(os.path.dirname(
        os.path.realpath(__file__)), "data", "swadesh", "spa.txt"), "r", "utf-8")
    swadesh_entries = []
    for line in swadesh_file:
        line = line.strip()
        for e in line.split(","):
            stem = stemmer.stem(e)
            swadesh_entries.append(stem)
    # find all entries that contain one of the swadesh words
    # save entry ids to list
    entry_ids = []
    dictdata_ids = cr.dictdata_string_ids
    for dictdata_id in dictdata_ids:
        src_language_iso = cr.src_languages_iso_for_dictdata_id(dictdata_id)
        tgt_language_iso = cr.tgt_languages_iso_for_dictdata_id(dictdata_id)
        # is there some spanish?
        if (src_language_iso != ['spa']) and (tgt_language_iso != ['spa']):
            continue
        for entry_id, head, translation in \
                cr.ids_with_heads_with_translations_for_dictdata_id(dictdata_id):
            # Normalize so `translation` is always the Spanish side.
            if src_language_iso == ['spa']:
                (head, translation) = (translation, head)
            # Drop parenthesised annotations.
            translation = re.sub(" ?\([^)]\)", "", translation)
            if translation in stopwords:
                entry_ids.append(entry_id)
            else:
                translation = qlc.utils.remove_stopwords(translation, stopwords)
                phrase_stems = qlc.utils.stem_phrase(translation, stemmer, True)
                for stem in phrase_stems:
                    if stem in swadesh_entries:
                        entry_ids.append(entry_id)
    #print(len(entry_ids))
    #return
    input_entry_csv = os.path.join(input_path, "entry.csv")
    output_entry_csv = os.path.join(output_path, "entry.csv")
    input_annotation_csv = os.path.join(input_path, "annotation.csv")
    output_annotation_csv = os.path.join(output_path, "annotation.csv")
    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")
    annotation_dict = collections.defaultdict(list)
    # cache annotations for lookup (row 0 is the header)
    for i, line in enumerate(fileinput.input(
            input_annotation_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[data[_annotation_table_columns['entry_id'] + 1]].append(line)
    fileinput.nextfile()
    output = codecs.open(output_entry_csv, "w", "utf-8")
    count_entries = 0
    # Write only the entries whose id matched a Swadesh stem, together
    # with their cached annotations.
    for i, line in enumerate(fileinput.input(
            input_entry_csv, openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)
    fileinput.nextfile()
    output.close()
    output_annotation.close()
    # Worldists
    cr = CorpusReaderWordlist(sys.argv[1])
    print("Data loaded")
    # find all entries that contain one of the swadesh words
    # save entry ids to list
    wordlistdata_ids = cr.wordlistdata_string_ids
    bibtex_keys = collections.defaultdict(list)
    for wid in wordlistdata_ids:
        wordlistdata_string = cr.wordlistdata_string_ids[wid]
        bibtex_key = wordlistdata_string.split("_")[0]
        bibtex_keys[bibtex_key].append(wid)
    wordlistentry_ids = []
    # NOTE(review): `for bibtex_key in bibtex_key:` iterates the string
    # left over from the previous loop, not `bibtex_keys` — almost
    # certainly a bug.  Inside, `wordlistdata_id` and `dictdata_id` are
    # also unbound in this scope, and `wordlistentry_ids` starts empty
    # so both inner loops never run.  Left unchanged pending confirmation.
    for bibtex_key in bibtex_key:
        # first collect all concepts in this book where the spanish counterpart
        # has one of the swadesh words
        concepts = []
        for wordlistentry_id in wordlistentry_ids:
            language_iso = cr.get_language_code_for_wordlistdata_id(wordlistdata_id)
            # is there some spanish?
            if language_iso != ['spa']:
                continue
            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id(dictdata_id):
                counterpart = re.sub(" ?\([^)]\)", "", counterpart)
                if counterpart in stopwords:
                    entry_ids.append(entry_id)
                else:
                    counterpart = qlc.utils.remove_stopwords(counterpart, stopwords)
                    phrase_stems = qlc.utils.stem_phrase(counterpart, stemmer, True)
                    for stem in phrase_stems:
                        if stem in swadesh_entries:
                            concepts.append(concept)
        # now collect the entry ids for those concepts
        for wordlistentry_id in wordlistentry_ids:
            for entry_id, concept, counterpart in \
                    cr.ids_with_concepts_with_counterparts_for_dictdata_id(dictdata_id):
                if concept in concepts:
                    wordlistentry_ids.append(entry_id)
    input_entry_csv = os.path.join(input_path, "wordlistentry.csv")
    output_entry_csv = os.path.join(output_path, "wordlistentry.csv")
    input_annotation_csv = os.path.join(input_path, "wordlistannotation.csv")
    output_annotation_csv = os.path.join(output_path, "wordlistannotation.csv")
    output_annotation = codecs.open(output_annotation_csv, "w", "utf-8")
    annotation_dict = collections.defaultdict(list)
    # Cache wordlist annotations keyed by entry id (row 0 is the header).
    for i, line in enumerate(fileinput.input(input_annotation_csv,
            openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output_annotation.write(line)
            continue
        data = line.strip().split("\t")
        annotation_dict[data[_wordlistannotation_table_columns['entry_id'] + 1]].append(line)
    fileinput.nextfile()
    output = codecs.open(output_entry_csv, "w", "utf-8")
    count_entries = 0
    for i, line in enumerate(fileinput.input(input_entry_csv,
            openhook=fileinput.hook_encoded("utf-8"))):
        if i == 0:
            output.write(line)
            continue
        data = line.strip().split("\t")
        if data[0] in entry_ids:
            output.write(line)
            for annotation_line in annotation_dict[data[0]]:
                output_annotation.write(annotation_line)
    fileinput.nextfile()
    output.close()
    output_annotation.close()
def add_line(self, line):
    """Feed one raw IRC-log line into the game-state machine.

    Parses timestamp/nick/message, tracks nick changes, detects game
    start/stop, and records lynches, kills, shots, quits, idles and the
    final role listing onto self.* accumulators.  File context (for
    timestamps and skipping) comes from the module-global fileinput.
    """
    # Parse out timestamp, nick, and message
    line = line.strip()
    # strip formatting codes (mIRC bold/underline/colour control bytes)
    line = re.sub("\x1f|\x02|\x12|\x0f|\x16|\x03(?:\d{1,2}(?:,\d{1,2})?)?", '', line)
    m = re.match('\[([0-9:]+)\] (\*\*\*|\* [^ ]+|<.+?>) (.*)', line)
    if m == None:
        # blank line perhaps — retry without the trailing message group
        m = re.match('\[([0-9:]+)\] (\*\*\*|\* [^ ]+|<.+?>)', line)
        if m == None:
            print ("Unrecognized line " + line)
            return
    time = m.group(1)
    if m.group(2) == "***":
        nick = "**Server**"
        action = False
    else:
        if m.group(2)[0] == "*":
            # "* nick" form: a /me action line
            nick = m.group(2)[2:]
            action = True
        else:
            # "<nick>" form: strip the angle brackets
            nick = m.group(2)[1:-1]
            action = False
    if len(m.groups()) == 3:
        message = m.group(3)
    else:
        message = ''
    # Figure out full timestamp from the time + filename
    fname = fileinput.filename()
    m = re.search('([0-9]{4})/?([0-9]{2})/?([0-9]{2})', fname)
    if m == None:
        print ("Could not find timestamp for file " + fname)
        print ("Skipping file...")
        fileinput.nextfile()
        return
    # YYYYMMDDhhmmss built from the filename date and the line's time.
    timestamp = "%s%s%s%s%s%s" % (m.group(1), m.group(2), m.group(3), time[0:2], time[3:5], time[6:8])
    # If we or the bot joins/quits, wipe whatever current game is in progress as we lost it
    if nick == "**Server**":
        m = re.match('(?:Joins|Quits|Parts): (.*?) \((.*?)\)', message)
        if m != None:
            if m.group(1) in self.ownnicks or m.group(1) in self.botnicks:
                self.reset()
                return
    # Do we need to start a new game?
    m = re.match('(.*): Welcome to Werewolf, the popular detective/social party game \(a theme of Mafia\)\.', message)
    if m != None:
        if nick in self.botnicks:
            # if we are currently running a game, this is a bug
            if self.game:
                print ("!!! BUG !!! Starting a new game when a game is already running! File: %s Line: %s (started in %s at line %s)" % (fileinput.filename(), fileinput.filelineno(), self.startfile, self.startline))
                exit(1)
            # started a new game, first group is the nicks of who are playing
            self.game = True
            self.id = timestamp
            self.players = m.group(1).split(', ')
            self.gamesize = len(self.players)
            self.startfile = fileinput.filename()
            self.startline = fileinput.filelineno()
            # now set game options (wolfgunner, nightgunner, hiddentraitor)
            # wolfgunner and nightgunner are on if the game took place on or after 9/6/2013
            # hiddentraitor is on if the game took place on or after 2/10/2014
            if int(timestamp) > 20130906000000:
                self.options['wolfgunner'] = True
                self.options['nightgunner'] = True
            if int(timestamp) > 20140210000000:
                self.options['hiddentraitor'] = True
            # check if id already exists in the database, if so we can just skip over this game (saves a lot of processing/regexes)
            # only do this if we aren't updating the schema
            doc = r.table('games').get(self.id).run(self.db)
            if doc and doc['schema'] == self.schema:
                self.skipped += 1
                self.reset()
                return
            elif doc:
                self.replace = True
        elif nick not in self.othernicks:
            self.othernicks.append(nick)
            print ("Possible missed bot nick: %s" % nick)
    # If we have a game running, record the line
    if self.game:
        # determine realnick from nickmap
        realnick = self.nickmap.get(nick, nick)
        # determine if we need to add something to nickmap
        if nick == "**Server**":
            m = re.match('(.*?) is now known as (.*)', message)
            if m != None:
                oldnick = m.group(1)
                newnick = m.group(2)
                self.nickmap[newnick] = self.nickmap.get(oldnick, oldnick)
                if oldnick in self.botnicks:
                    # if it is any of these people, the bot never actually changed
                    never = ['Iciloo', 'sid|1']
                    if newnick not in never:
                        # HALP, THE BOT CHANGED NICKS
                        print ("Bot changed nicks from %s to %s" % (oldnick, newnick))
                        self.botnicks.append(newnick)
        # record the line (Python 2 unicode() decode; bad bytes drop the line)
        try:
            self.lines.append({'timestamp': timestamp, 'nick': nick, 'realnick': realnick, 'message': unicode(message, 'utf-8'), 'action': action})
        except UnicodeDecodeError:
            # ignore the line
            return
        # If bot said something, figure out the action
        if nick in self.botnicks:
            # record lynches/kills/quits (!quit/kick)/idles (incl part and /quit)/shot
            # on lynch, increment days counter. on kill, increment nights counter
            # on shoot, record current day, who was shot, and shot outcome
            m = re.match('(Day|Night) lasted ([0-9]{2}):([0-9]{2})', message)
            if m != None:
                # increment days or nights (current day is self.days + 1)
                if m.group(1) == 'Day':
                    self.days.append(int(m.group(2)) * 60 + int(m.group(3)))
                    self.curday += 1
                elif m.group(1) == 'Night':
                    self.nights.append(int(m.group(2)) * 60 + int(m.group(3)))
                    self.curnight += 1
                return
            curday = str(self.curday + 1)
            curnight = str(self.curnight)
            for msg in self.lynchmessages:
                m = re.match(msg, message)
                if m != None:
                    if curday not in self.lynched:
                        self.lynched[curday] = []
                    if len(m.groups()) == 1:
                        # have a victim
                        self.lynched[curday].append(m.group(1))
                    return
            for msg in self.killmessages:
                m = re.match(msg, message)
                if m != None:
                    if curnight not in self.killed:
                        self.killed[curnight] = []
                    if len(m.groups()) == 1:
                        self.killed[curnight].append(m.group(1))
                    return
            for msg in self.shotmessages:
                # shotmessages entries are (pattern, outcome) pairs
                m = re.match(msg[0], message)
                if m != None:
                    if curday not in self.shot:
                        self.shot[curday] = []
                    if msg[1] == "shoot":
                        # remember the target; the outcome line follows
                        self.prevtarget = m.group(1)
                        return
                    elif msg[1] == "explode" or msg[1] == "miss":
                        target = self.prevtarget
                    else:
                        target = m.group(1)
                    self.shot[curday].append({'target': target, 'outcome': msg[1]})
                    return
            for msg in self.quitmessages:
                m = re.match(msg, message)
                if m != None:
                    self.quit.append(m.group(1))
                    return
            for msg in self.idlemessages:
                m = re.match(msg, message)
                if m != None:
                    self.idled.append(m.group(1))
                    return
            # is game over?
            if not self.finished:
                # figure out who won
                m = re.match('(.*) has forced the game to stop', message)
                if m != None:
                    # game was !fstopped, so wipe our slate
                    self.reset()
                    return
                m = re.match('Game over! (All the wolves are dead|There are).*', message)
                if m != None:
                    self.finished = True
                    if m.group(1) == 'All the wolves are dead':
                        self.winner = "village"
                    else:
                        self.winner = "wolves"
                    return
            else:
                # record time and role data and mark game as over
                # time is before role data, so don't mark over until we have both
                m = re.match('Game lasted ([0-9]+):([0-9]+)\. ([0-9]+):([0-9]+) was day\. ([0-9]+):([0-9]+) was night\.', message)
                if m != None:
                    self.daytime = int(m.group(3)) * 60 + int(m.group(4))
                    self.nighttime = int(m.group(5)) * 60 + int(m.group(6))
                    return
                # if we get here, message is the listing of roles
                list = message.split('. ')
                for item in list:
                    # remove any trailing periods
                    item = item.rstrip('.')
                    m = re.match('The (.*?) (were|was) (.*)', item)
                    if m == None:
                        print ("Failed to match regex to %s" % (message))
                        continue
                    # variable is a troll and faked some roles, so make sure we only record valid ones
                    if m.group(1) not in self.rolemap:
                        continue
                    # determine the number of nicks in the 3rd group
                    if m.group(2) == 'was':
                        # one nick only, easy
                        self.ruleset[self.rolemap[m.group(1)]] = 1
                        self.roles[self.rolemap[m.group(1)]] = [m.group(3)]
                    else:
                        # multiple nicks, not so easy
                        nicks = re.split(', (?:and )?| and ', m.group(3))
                        self.ruleset[self.rolemap[m.group(1)]] = len(nicks)
                        self.roles[self.rolemap[m.group(1)]] = nicks
                # finally mark the game as over
                self.game = False
def update_event(self, inp=-1):
    """Node update hook: advance the module-global fileinput stream to
    its next file and publish fileinput.nextfile()'s return value on
    output port 0."""
    next_result = fileinput.nextfile()
    self.set_output_val(0, next_result)