def expand_minmax(self):
    """Copy ``self.destfor`` to ``self.destcmp``, expanding compare blocks.

    Lines that do not contain ``KEYWORDS['compare']`` are written to the
    destination unchanged.  A line that does contain it opens a block that
    runs up to the next line containing ``KEYWORDS['endcompare']``.  Inside
    the block, every line containing ``KEYWORDS['case']`` names a variable;
    the lines that follow it, up to (not including) the next line
    containing ``KEYWORDS['endcase']``, are that variable's content.  The
    collected ``{variable: content}`` mapping (in file order), the operator
    parsed from the opening line and the open destination file are handed
    to ``self.roll_out_compare``.

    Raises:
        ValueError: if the source file ends inside a compare or case block.
            (Previously this situation looped forever, because reading past
            end-of-file keeps returning an empty string.)
    """
    compare_format = (Keyword(KEYWORDS['compare']) + '('
                      + Regex(r'[^\s\(\)]*')('op') + ')')
    case_var_format = Word(alphas + "_", alphanums + "_" + ".")('var')
    case_format = Keyword(KEYWORDS['case']) + case_var_format + ":"

    # ``with`` guarantees both handles are closed even when parsing or
    # roll_out_compare raises.
    with open(self.destfor, 'r') as sfile, open(self.destcmp, 'w') as dfile:
        for line in sfile:
            if KEYWORDS['compare'] not in line:
                dfile.write(line)
                continue

            op = compare_format.parseString(line).op
            varlist = OrderedDict()
            while True:
                # next() shares the file's iterator with the outer loop, so
                # the outer loop resumes right after the block.
                block_line = next(sfile, '')
                if not block_line:
                    raise ValueError(
                        'reached end of %s before %r'
                        % (self.destfor, KEYWORDS['endcompare']))
                if KEYWORDS['endcompare'] in block_line:
                    break
                if KEYWORDS['case'] not in block_line:
                    # Anything between cases is ignored, as before.
                    continue

                var = case_format.parseString(block_line).var
                body = []
                while True:
                    case_line = next(sfile, '')
                    if not case_line:
                        raise ValueError(
                            'reached end of %s before %r'
                            % (self.destfor, KEYWORDS['endcase']))
                    if KEYWORDS['endcase'] in case_line:
                        break
                    body.append(case_line)
                varlist[var] = ''.join(body)

            self.roll_out_compare(varlist, op, dfile)
def expand_sync(self):
    """Copy ``self.destbool`` to ``self.dest``, expanding sync/mirror blocks.

    A line containing ``KEYWORDS['sync']`` or ``KEYWORDS['mirror']`` opens a
    block and is parsed for ``header_name.field`` and a numeric value; the
    matching counter (``self.sync_id`` / ``self.mirror_id``) is incremented.
    Lines inside an open block are collected (stripped) instead of being
    copied.  The closing line (``KEYWORDS['endsync']`` /
    ``KEYWORDS['endmirror']``) passes the collected lines, the counter, the
    field name and the value to ``self.write_sync_action`` /
    ``self.write_mirror_action`` and is replaced in the output by
    ``APPLY_SYNC_STRING`` / ``APPLY_MIRROR_STRING`` formatted with the
    closing line's indentation and the counter.  Every other line is copied
    unchanged.
    """
    sync_format = (Keyword(KEYWORDS['sync']) + '('
                   + Regex(r'[_a-zA-Z]*')("header_name") + "."
                   + Regex(r'[^\s\(\),]*')("field") + ","
                   + Word(nums)("val") + ')')
    mirror_format = (Keyword(KEYWORDS['mirror']) + '('
                     + Regex(r'[_a-zA-Z]*')("header_name") + "."
                     + Regex(r'[^\s\(\),]*')("field") + ","
                     + Word(nums)("val") + ')')

    active_sync, active_mirror = False, False
    fields, field_name, val = [], None, None

    # ``with`` closes both files even if parsing or an action writer raises.
    with open(self.destbool, 'r') as sfile, open(self.dest, 'w') as dfile:
        for line in sfile:
            # The order of these tests is significant and kept as in the
            # original: opening keywords are checked before closing ones.
            if KEYWORDS['sync'] in line:
                res = sync_format.parseString(line)
                field_name, val = res.header_name + '.' + res.field, res.val
                active_sync = True
                self.sync_id += 1
            elif KEYWORDS['mirror'] in line:
                res = mirror_format.parseString(line)
                field_name, val = res.header_name + '.' + res.field, res.val
                active_mirror = True
                self.mirror_id += 1
            elif KEYWORDS['endsync'] in line:
                active_sync = False
                self.write_sync_action(fields, self.sync_id, field_name, val)
                fields = []
                # Leading whitespace of the closing line; the line contains
                # the keyword, so its stripped form is never empty.
                indent = line[:len(line) - len(line.lstrip())]
                dfile.write(APPLY_SYNC_STRING % (indent, self.sync_id))
            elif KEYWORDS['endmirror'] in line:
                active_mirror = False
                self.write_mirror_action(fields, self.mirror_id, field_name,
                                         val)
                fields = []
                indent = line[:len(line) - len(line.lstrip())]
                dfile.write(APPLY_MIRROR_STRING % (indent, self.mirror_id))
            elif active_sync or active_mirror:
                fields.append(line.strip())
            else:
                dfile.write(line)
class DSLParser:
    """Grammar for programs of the form ``def run() { <statements> }``."""

    def __init__(self, n, colors, shapes, max_constant=5):
        """
        Build the grammar.

        :param n: length of side of the grids; positions are ``0 .. n-1``
        :param colors: list of color names
        :param shapes: list of shape names
        :param max_constant: exclusive upper bound for ``repeat`` counts,
            which range over ``1 .. max_constant-1``
        """
        # Terminals: literal names plus the corresponding marker getter.
        color_terms = Or([Keyword(name) for name in colors])
        color_terms ^= Keyword("getMarkerColor()")
        self.colors = color_terms

        shape_terms = Or([Keyword(name) for name in shapes])
        shape_terms ^= Keyword("getMarkerShape()")
        self.shapes = shape_terms

        self.positions = Or([Keyword(str(pos)) for pos in range(n)])
        self.constants = Or(
            [Keyword(str(count)) for count in range(1, max_constant)])

        # Actions: the two-argument move first, then the argument-less ones,
        # tried in this order.
        actions = "move(" + self.positions + "," + self.positions + ")"
        for literal in ("moveUp()", "moveDown()", "moveLeft()",
                        "moveRight()", "moveTop()", "moveBottom()",
                        "moveLeftmost()", "moveRightmost()",
                        "moveToMovableMarker()", "pickMarker()",
                        "putMarker()", "fixMarker()"):
            actions = actions | literal
        self.actions = actions

        # Conditions: equality tests first, then the predicate literals.
        shape_equal = Group(self.shapes + "==" + self.shapes)
        color_equal = Group(self.colors + "==" + self.colors)
        plain_conditions = shape_equal | color_equal
        for literal in ("markersPresent()", "movableMarkersPresent()",
                        "existMovableMarkers()", "upperBoundary()",
                        "lowerBoundary()", "leftBoundary()",
                        "rightBoundary()", "true"):
            plain_conditions = plain_conditions | literal
        self.conditions = plain_conditions
        # A condition may be prefixed by a single ``not``.
        negated = Group(Keyword("not") + self.conditions)
        self.conditions = self.conditions | negated

        # Statements refer to blocks of statements, hence the Forward.
        block = Forward()
        while_stmt = Group(
            Keyword("while") + "(" + self.conditions + ")"
            + "{" + Group(block) + "}")
        repeat_stmt = Group(
            Keyword("repeat") + "(" + self.constants + ")"
            + "{" + Group(block) + "}")
        if_stmt = Group(
            Keyword("if") + "(" + self.conditions + ")"
            + "{" + Group(block) + "}")
        ifelse_stmt = Group(
            Keyword("ifelse") + "(" + self.conditions + ")"
            + "{" + Group(block) + "}"
            + Keyword("else") + "{" + Group(block) + "}")
        action_stmt = Group(self.actions + ";")
        stmt = while_stmt | repeat_stmt | if_stmt | ifelse_stmt | action_stmt

        block << OneOrMore(stmt)
        self.statements = block

        self.program = (Keyword("def") + Keyword("run()") + "{"
                        + Group(self.statements) + "}")

    def parse_string(self, program):
        """Parse ``program``; the whole string must match the grammar."""
        return self.program.parseString(program, parseAll=True)
molecule_name = os.path.basename(elem) molecule_name = os.path.splitext(molecule_name)[-2] # parse lines for data extraction for line in lines: # find out units of distance in file try: res1 = split_bohr.parseString(line) if res1[1] == 'Bohr': unit_array.append(res1[1]) except Exception: print('No units given in this line') try: res2 = split_angstrom.parseString(line) if res2[1] == 'Angstrom': unit_array.append(res2[1]) except Exception: print('No units given in this line') try: parsed_lines = clt_parser.parseString(line) list_conversion = list(parsed_lines) df = df.append( { 'label': list_conversion[0][0], 'atomic_number': float(list_conversion[1]), 'x': float(list_conversion[2]), 'y': float(list_conversion[3]),
def extract_moments(file_directory):
    """Collect multipole moments from every ``.mom`` file under a directory.

    The directory tree is walked recursively.  In each ``.mom`` file a line
    of the form ``ATOM X Y Z Type <ATOM-TYPE> Rank K`` starts an atom, and
    lines consisting of floating point numbers supply the moment values.
    The values are handed out to the atoms in file order: an atom of rank
    ``K`` takes ``K + 1`` layers ``Q0 .. QK`` of ``1, 3, 5, ...`` values.

    Fixes relative to the previous implementation:

    * the work lists were created with ``([],) * n``, which bound every
      name to the *same* list, so error strings and atom names ended up
      among the moment values;
    * ``np.float`` and ``DataFrame.append`` no longer exist in current
      NumPy / pandas;
    * files are closed after reading, and atoms/values are reset for each
      file so one file's data cannot leak into the next.

    :param file_directory: root directory to search for ``.mom`` files
    :return: ``{'moments': [{atom_name: info}, ...]}`` with one entry per
        atom line, where ``info`` has the keys ``atom``, ``scheme`` (file
        name without extension), ``type``, ``rank``, ``moments`` (the
        rendered moment table as a list of text lines) and ``file``
    :raises IndexError: if a file holds fewer moment values than its atoms'
        ranks require
    """
    # locate every .mom file below file_directory
    mom_files = []
    for root, _dirs, files in os.walk(file_directory):
        for file_name in files:
            if os.path.splitext(file_name)[1] == '.mom':
                mom_files.append(os.path.join(root, file_name))

    # parsing grammar
    df_parser = Keyword("! Based on DF-type :") + Word(alphas)
    float_parser = Combine(Optional('-') + Word(nums) + '.' + Word(nums))
    mom_parser = OneOrMore(float_parser)
    # ATOM X Y Z Type <ATOM-TYPE> Rank K
    atom_line = (Word(alphas + nums) + mom_parser + OneOrMore(Word(alphas))
                 + Word(nums))

    # Diagnostics only; they are not part of the returned structure.
    error_array = []
    df_array = []

    json_result = {'moments': []}

    for elem in mom_files:
        with open(elem, 'r') as file_object:
            lines = file_object.readlines()
        mom_name = os.path.splitext(os.path.basename(elem))[0]

        atoms = []   # one record per atom line of this file
        coords = []  # moment values of this file, as parsed strings

        for line in lines:
            # Each line is tried against all three grammars; a failed match
            # raises inside pyparsing and is reported, as before.
            try:
                res1 = df_parser.parseString(line)
                df_array.append(res1[1])
            except Exception:
                print('No DF-Type is specified in file')
                error_array.append('No DF-Type is specified in file')

            try:
                res2 = atom_line.parseString(line)
                # tokens: name, x, y, z, 'Type', <type>, 'Rank', <rank>
                atoms.append(
                    {'atom': res2[0], 'type': res2[5], 'rank': res2[7]})
            except Exception:
                print('This is not a atom type line')

            try:
                coords.extend(mom_parser.parseString(line))
            except Exception:
                print('This is not a moment value')

        coords_float = [float(val) for val in coords]

        cursor = 0  # index of the next unassigned moment value
        for record in atoms:
            # +1 so that Q0 is counted as well: rank 4 -> Q0 .. Q4
            n_layers = int(record['rank']) + 1

            layers = []
            for layer in range(n_layers):
                size = 2 * layer + 1
                chunk = coords_float[cursor:cursor + size]
                if len(chunk) < size:
                    raise IndexError(
                        'not enough moment values in %s for atom %s'
                        % (elem, record['atom']))
                layers.append(chunk)
                cursor += size

            # pad the shorter layers with NaN so they form a rectangle
            max_len = max(len(layer) for layer in layers)
            for layer in layers:
                layer.extend((max_len - len(layer)) * [np.nan])
            arr = np.asarray(layers).T

            # row labels: 0, 1s, 1c, 2s, 2c, ...
            indices = ['0']
            for ind in range(1, n_layers):
                indices.append(f'{ind}s')
                indices.append(f'{ind}c')

            df_mom = pd.DataFrame(
                arr,
                columns=[f'Q{i}' for i in range(n_layers)],
                index=indices,
            )
            mom_data = {
                'atom': record['atom'],
                'scheme': mom_name,
                'type': record['type'],
                'rank': record['rank'],
                'moments': df_mom.to_string().splitlines(),
                'file': elem,
            }
            json_result['moments'].append({record['atom']: mom_data})

    return json_result
def extract_coordinates(file_directory):
    """Read atom coordinates from the ``.clt`` and ``.xyz`` files of a directory.

    Files are located with ``glob(file_directory + '*.clt')`` and
    ``glob(file_directory + '*.xyz')``, so ``file_directory`` is used as a
    plain prefix (include the trailing separator).

    ``.clt`` rows are accumulated in one frame across all ``.clt`` files;
    a ``Bohr`` / ``Angstrom`` unit line is looked for on every line.  After
    each file, if exactly one unit has been seen and it is ``Bohr``, the
    x/y/z columns are divided by 1.89; if two have been seen the unit is
    recorded as ``'Unknown'`` and an error entry is added.  Each ``.xyz``
    file gets its own frame, with the atomic number looked up in
    ``atom_database.database`` by element symbol.

    Fixes relative to the previous implementation:

    * ``unit_array[0]`` raised ``IndexError`` whenever no unit line had been
      found (only ``.xyz`` files, or no files at all); the unit is now
      reported as ``'Unknown'``;
    * rows were added with ``DataFrame.append``, which no longer exists in
      pandas 2 and whose failure was swallowed by the broad ``except``,
      leaving the frames silently empty;
    * files are closed after reading;
    * the two duplicated branches are merged.

    :param file_directory: path prefix of the files to read
    :return: ``{'coordinates': {...}}`` with the keys ``file``,
        ``data frame`` (JSON of the ``.clt`` frame), ``errors``, ``units``
        and ``coordinates`` (text lines of the last molecule read, or of the
        empty ``.clt`` frame when no file was found)
    """
    columns = ['label', 'atomic_number', 'x', 'y', 'z']

    def read_lines(path):
        # Read the whole file and close it straight away.
        with open(path, 'r') as file_object:
            return file_object.readlines()

    def with_rows(frame, rows):
        # Return ``frame`` extended by ``rows`` (a list of dicts).
        if not rows:
            return frame
        new_rows = pd.DataFrame(rows, columns=columns)
        if frame.empty:
            return new_rows
        return pd.concat([frame, new_rows], ignore_index=True)

    clt_files = glob.glob(file_directory + '*.clt')
    xyz_files = glob.glob(file_directory + '*.xyz')

    # parsing grammar
    clt_parser = (Word(alphas + nums) + Word(nums + '.')
                  + Word(printables) + Word(printables) + Word(printables))
    xyz_parser = (Word(alphas)
                  + Word(printables) + Word(printables) + Word(printables))
    bohr = Keyword('Bohr')
    angstrom = Keyword('Angstrom')
    word = ~bohr + Word(alphas)
    sentence = OneOrMore(word)
    split_bohr = sentence('unit') + bohr + sentence('degree')
    split_angstrom = Keyword('Units') + angstrom

    unit_array = []
    error_array = []
    molecules_from_files = {}
    df = pd.DataFrame(data={name: [] for name in columns})

    # extract information from .clt files
    for elem in clt_files:
        lines = read_lines(elem)
        molecule_name = os.path.splitext(os.path.basename(elem))[0]

        rows = []
        for line in lines:
            # find out units of distance in file
            try:
                res1 = split_bohr.parseString(line)
                if res1[1] == 'Bohr':
                    unit_array.append(res1[1])
            except Exception:
                print('invalid unit line')

            try:
                res2 = split_angstrom.parseString(line)
                if res2[1] == 'Angstrom':
                    unit_array.append(res2[1])
            except Exception:
                print('not valid data line')

            try:
                tokens = list(clt_parser.parseString(line))
                rows.append({
                    # the label is the first character of the first token
                    'label': tokens[0][0],
                    'atomic_number': float(tokens[1]),
                    'x': float(tokens[2]),
                    'y': float(tokens[3]),
                    'z': float(tokens[4]),
                })
                print('valid data line')
            except Exception:
                print('not valid data line')

        df = with_rows(df, rows)

        # perform necessary unit conversions
        if len(unit_array) == 1:
            if unit_array[0] == 'Angstrom':
                print('Units are Angstrom')
            elif unit_array[0] == 'Bohr':
                df['x'] = df['x'] / 1.89
                df['y'] = df['y'] / 1.89
                df['z'] = df['z'] / 1.89
        # ambiguous case: flag it in the summary instead of guessing
        elif len(unit_array) == 2:
            print('ambiguous units')
            unit_array.clear()
            unit_array.append('Unknown')
            error_array.append('Ambiguous unit in file.')

        molecules_from_files[str(molecule_name)] = df

    if len(clt_files) > 1:
        print('there are too many clt files, ambiguous')
        error_array.append(
            'There are too many .clt files, see file origin above for file used.')

    # extract information from .xyz files
    for elem in xyz_files:
        lines = read_lines(elem)
        molecule_name = os.path.splitext(os.path.basename(elem))[0]

        rows = []
        for line in lines:
            try:
                tokens = list(xyz_parser.parseString(line))
                matches = atom_database.database[
                    atom_database.database['symbol'] == tokens[0]]
                # First column of the first matching row, as before.
                # NOTE(review): assumes that column is the atomic number.
                atomic_number = matches.iloc[0, 0]
                rows.append({
                    'label': tokens[0],
                    'atomic_number': atomic_number,
                    'x': float(tokens[1]),
                    'y': float(tokens[2]),
                    'z': float(tokens[3]),
                })
                print('valid data line')
            except Exception:
                print('not valid data line')

        molecules_from_files[str(molecule_name)] = with_rows(
            pd.DataFrame(data={name: [] for name in columns}), rows)

    if not clt_files and not xyz_files:
        error_array.append('No .clt or .xyz files found.')
    if len(error_array) == 0:
        error_array.append('No errors')

    # No unit line found (e.g. only .xyz input): report it as unknown
    # rather than failing on an empty list.
    units = unit_array[0] if unit_array else 'Unknown'

    # As before, the summary text comes from the molecule stored last while
    # 'data frame' always holds the accumulated .clt frame.
    if molecules_from_files:
        summary_frame = list(molecules_from_files.values())[-1]
    else:
        summary_frame = df
    coord_string = summary_frame.to_string().splitlines()

    coord_data = {
        'coordinates': {
            'file': file_directory,
            'data frame': df.to_json(),
            'errors': error_array,
            'units': units,
            'coordinates': coord_string,
        }
    }
    return coord_data
# using pyparsing's 'search string' method for elem in mom_files: # open file file_object = open(elem, 'r') lines = file_object.readlines() mom_name = os.path.basename(elem) mom_name = os.path.splitext(mom_name)[-2] # parse lines for data extraction for line in lines: # get df type first try: res1 = df_parser.parseString(line) df_array.append(res1[1]) except Exception: print('No DF-Type is specified in file') error_array.append('No DF-Type is specified in file') # get information about each atom try: res2 = atom_line.parseString(line) atom.append(res2[0]) type = res2[5] rank = (res2[7]) df = df.append({'atom': atom[len(atom) - 1], 'type': type, 'rank': rank}, ignore_index=True) except Exception: print('This is not a atom type line')
molecule_name = os.path.basename(elem) molecule_name = os.path.splitext(molecule_name)[-2] # parse lines for data extraction for line in lines: # find out units of distance in file try: res1 = split_bohr.parseString(line) if res1[1] == 'Bohr': unit_array.append(res1[1]) except Exception: print('invalid unit line') try: res2 = split_angrstom.parseString(line) if res2[1] == 'Angstrom': unit_array.append(res2[1]) except Exception: print('not valid data line') try: parsed_lines = clt_parser.parseString(line) list_conversion = list(parsed_lines) df = df.append( { 'label': list_conversion[0][0], 'atomic_number': float(list_conversion[1]), 'x': float(list_conversion[2]), 'y': float(list_conversion[3]),